tcp: Clean up some functions.
net/ipv4/tcp_ipv4.c (platform/kernel/linux-starfive.git)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              Implementation of the Transmission Control Protocol(TCP).
8  *
9  *              IPv4 specific functions
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  */
18
19 /*
20  * Changes:
21  *              David S. Miller :       New socket lookup architecture.
22  *                                      This code is dedicated to John Dyson.
23  *              David S. Miller :       Change semantics of established hash,
24  *                                      half is devoted to TIME_WAIT sockets
25  *                                      and the rest go in the other half.
26  *              Andi Kleen :            Add support for syncookies and fixed
27  *                                      some bugs: ip options weren't passed to
28  *                                      the TCP layer, missed a check for an
29  *                                      ACK bit.
30  *              Andi Kleen :            Implemented fast path mtu discovery.
31  *                                      Fixed many serious bugs in the
32  *                                      request_sock handling and moved
33  *                                      most of it into the af independent code.
34  *                                      Added tail drop and some other bugfixes.
35  *                                      Added new listen semantics.
36  *              Mike McLagan    :       Routing by source
37  *      Juan Jose Ciarlante:            ip_dynaddr bits
38  *              Andi Kleen:             various fixes.
39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
40  *                                      coma.
41  *      Andi Kleen              :       Fix new listen.
42  *      Andi Kleen              :       Fix accept error reporting.
43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
45  *                                      a single port at the same time.
46  */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
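/* Per-CPU control socket used by tcp_v4_send_reset() and tcp_v4_send_ack()
 * below to emit RSTs and ACKs on behalf of packets that have no full socket
 * of their own.
 */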
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98         return secure_tcp_seq(ip_hdr(skb)->daddr,
99                               ip_hdr(skb)->saddr,
100                               tcp_hdr(skb)->dest,
101                               tcp_hdr(skb)->source);
102 }
103
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
108
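/* Decide whether the TIME-WAIT socket @sktw occupying the 4-tuple we want may
 * be reused for a new outgoing connection.  Returns 1 (after taking a
 * reference on @sktw) if reuse is allowed, 0 otherwise.
 */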
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111         int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112         const struct inet_timewait_sock *tw = inet_twsk(sktw);
113         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114         struct tcp_sock *tp = tcp_sk(sk);
115
116         if (reuse == 2) {
117                 /* Still does not detect *everything* that goes through
118                  * lo, since we require a loopback src or dst address
119                  * or direct binding to 'lo' interface.
120                  */
121                 bool loopback = false;
122                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123                         loopback = true;
124 #if IS_ENABLED(CONFIG_IPV6)
125                 if (tw->tw_family == AF_INET6) {
126                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130                                 loopback = true;
131                 } else
132 #endif
133                 {
134                         if (ipv4_is_loopback(tw->tw_daddr) ||
135                             ipv4_is_loopback(tw->tw_rcv_saddr))
136                                 loopback = true;
137                 }
138                 if (!loopback)
139                         reuse = 0;
140         }
141
142         /* With PAWS, it is safe from the viewpoint
143            of data integrity. Even without PAWS it is safe provided sequence
144            spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
145
146            Actually, the idea is close to VJ's, except that the timestamp
147            cache is held per port pair rather than per host, and the TW
148            bucket is used as the state holder.
149
150            If the TW bucket has already been destroyed we fall back to VJ's
151            scheme and use the initial timestamp retrieved from the peer table.
152          */
153         if (tcptw->tw_ts_recent_stamp &&
154             (!twp || (reuse && time_after32(ktime_get_seconds(),
155                                             tcptw->tw_ts_recent_stamp)))) {
156                 /* In case of repair and re-using TIME-WAIT sockets we still
157                  * want to be sure that it is safe as above but honor the
158                  * sequence numbers and time stamps set as part of the repair
159                  * process.
160                  *
161                  * Without this check re-using a TIME-WAIT socket with TCP
162                  * repair would accumulate a -1 on the repair assigned
163                  * sequence number. The first time it is reused the sequence
164                  * is -1, the second time -2, etc. This fixes that issue
165                  * without appearing to create any others.
166                  */
167                 if (likely(!tp->repair)) {
168                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169
170                         if (!seq)
171                                 seq = 1;
172                         WRITE_ONCE(tp->write_seq, seq);
173                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
174                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
175                 }
176                 sock_hold(sktw);
177                 return 1;
178         }
179
180         return 0;
181 }
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
183
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185                               int addr_len)
186 {
187         /* This check is replicated from tcp_v4_connect() and is intended to
188          * prevent the BPF program called below from accessing bytes that are
189          * outside the bound specified by the user in addr_len.
190          */
191         if (addr_len < sizeof(struct sockaddr_in))
192                 return -EINVAL;
193
194         sock_owned_by_me(sk);
195
196         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 }
198
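/* Illustrative sketch (userspace, not part of this file): tcp_v4_connect()
 * is reached through an ordinary connect() on an AF_INET stream socket, via
 * inet_stream_connect() and sk->sk_prot->connect().  The port and address
 * below are arbitrary example values:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */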
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201 {
202         struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
203         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
204         struct inet_timewait_death_row *tcp_death_row;
205         __be32 daddr, nexthop, prev_sk_rcv_saddr;
206         struct inet_sock *inet = inet_sk(sk);
207         struct tcp_sock *tp = tcp_sk(sk);
208         struct ip_options_rcu *inet_opt;
209         struct net *net = sock_net(sk);
210         __be16 orig_sport, orig_dport;
211         struct flowi4 *fl4;
212         struct rtable *rt;
213         int err;
214
215         if (addr_len < sizeof(struct sockaddr_in))
216                 return -EINVAL;
217
218         if (usin->sin_family != AF_INET)
219                 return -EAFNOSUPPORT;
220
221         nexthop = daddr = usin->sin_addr.s_addr;
222         inet_opt = rcu_dereference_protected(inet->inet_opt,
223                                              lockdep_sock_is_held(sk));
224         if (inet_opt && inet_opt->opt.srr) {
225                 if (!daddr)
226                         return -EINVAL;
227                 nexthop = inet_opt->opt.faddr;
228         }
229
230         orig_sport = inet->inet_sport;
231         orig_dport = usin->sin_port;
232         fl4 = &inet->cork.fl.u.ip4;
233         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
234                               sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
235                               orig_dport, sk);
236         if (IS_ERR(rt)) {
237                 err = PTR_ERR(rt);
238                 if (err == -ENETUNREACH)
239                         IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
240                 return err;
241         }
242
243         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
244                 ip_rt_put(rt);
245                 return -ENETUNREACH;
246         }
247
248         if (!inet_opt || !inet_opt->opt.srr)
249                 daddr = fl4->daddr;
250
251         if (!inet->inet_saddr) {
252                 if (inet_csk(sk)->icsk_bind2_hash) {
253                         prev_addr_hashbucket = inet_bhashfn_portaddr(&tcp_hashinfo,
254                                                                      sk, net, inet->inet_num);
255                         prev_sk_rcv_saddr = sk->sk_rcv_saddr;
256                 }
257                 inet->inet_saddr = fl4->saddr;
258         }
259
260         sk_rcv_saddr_set(sk, inet->inet_saddr);
261
262         if (prev_addr_hashbucket) {
263                 err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
264                 if (err) {
265                         inet->inet_saddr = 0;
266                         sk_rcv_saddr_set(sk, prev_sk_rcv_saddr);
267                         ip_rt_put(rt);
268                         return err;
269                 }
270         }
271
272         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
273                 /* Reset inherited state */
274                 tp->rx_opt.ts_recent       = 0;
275                 tp->rx_opt.ts_recent_stamp = 0;
276                 if (likely(!tp->repair))
277                         WRITE_ONCE(tp->write_seq, 0);
278         }
279
280         inet->inet_dport = usin->sin_port;
281         sk_daddr_set(sk, daddr);
282
283         inet_csk(sk)->icsk_ext_hdr_len = 0;
284         if (inet_opt)
285                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
286
287         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
288
289         /* Socket identity is still unknown (sport may be zero).
290          * However we set the state to SYN-SENT and, without releasing the
291          * socket lock, select a source port, enter ourselves into the hash
292          * tables and complete initialization after this.
293          */
294         tcp_set_state(sk, TCP_SYN_SENT);
295         tcp_death_row = net->ipv4.tcp_death_row;
296         err = inet_hash_connect(tcp_death_row, sk);
297         if (err)
298                 goto failure;
299
300         sk_set_txhash(sk);
301
302         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
303                                inet->inet_sport, inet->inet_dport, sk);
304         if (IS_ERR(rt)) {
305                 err = PTR_ERR(rt);
306                 rt = NULL;
307                 goto failure;
308         }
309         /* OK, now commit destination to socket.  */
310         sk->sk_gso_type = SKB_GSO_TCPV4;
311         sk_setup_caps(sk, &rt->dst);
312         rt = NULL;
313
314         if (likely(!tp->repair)) {
315                 if (!tp->write_seq)
316                         WRITE_ONCE(tp->write_seq,
317                                    secure_tcp_seq(inet->inet_saddr,
318                                                   inet->inet_daddr,
319                                                   inet->inet_sport,
320                                                   usin->sin_port));
321                 tp->tsoffset = secure_tcp_ts_off(net, inet->inet_saddr,
322                                                  inet->inet_daddr);
323         }
324
325         inet->inet_id = prandom_u32();
326
327         if (tcp_fastopen_defer_connect(sk, &err))
328                 return err;
329         if (err)
330                 goto failure;
331
332         err = tcp_connect(sk);
333
334         if (err)
335                 goto failure;
336
337         return 0;
338
339 failure:
340         /*
341          * This unhashes the socket and releases the local port,
342          * if necessary.
343          */
344         tcp_set_state(sk, TCP_CLOSE);
345         ip_rt_put(rt);
346         sk->sk_route_caps = 0;
347         inet->inet_dport = 0;
348         return err;
349 }
350 EXPORT_SYMBOL(tcp_v4_connect);
351
352 /*
353  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
354  * It can be called through tcp_release_cb() if socket was owned by user
355  * at the time tcp_v4_err() was called to handle ICMP message.
356  */
357 void tcp_v4_mtu_reduced(struct sock *sk)
358 {
359         struct inet_sock *inet = inet_sk(sk);
360         struct dst_entry *dst;
361         u32 mtu;
362
363         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
364                 return;
365         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
366         dst = inet_csk_update_pmtu(sk, mtu);
367         if (!dst)
368                 return;
369
370         /* Something is about to go wrong... Remember the soft error
371          * in case this connection is not able to recover.
372          */
373         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
374                 sk->sk_err_soft = EMSGSIZE;
375
376         mtu = dst_mtu(dst);
377
378         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
379             ip_sk_accept_pmtu(sk) &&
380             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
381                 tcp_sync_mss(sk, mtu);
382
383                 /* Resend the TCP packet because it's
384                  * clear that the old packet has been
385                  * dropped. This is the new "fast" path mtu
386                  * discovery.
387                  */
388                 tcp_simple_retransmit(sk);
389         } /* else let the usual retransmit timer handle it */
390 }
391 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
392
393 static void do_redirect(struct sk_buff *skb, struct sock *sk)
394 {
395         struct dst_entry *dst = __sk_dst_check(sk, 0);
396
397         if (dst)
398                 dst->ops->redirect(dst, sk, skb);
399 }
400
401
402 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
403 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
404 {
405         struct request_sock *req = inet_reqsk(sk);
406         struct net *net = sock_net(sk);
407
408         /* ICMPs are not backlogged, hence we cannot get
409          * an established socket here.
410          */
411         if (seq != tcp_rsk(req)->snt_isn) {
412                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
413         } else if (abort) {
414                 /*
415                  * Still in SYN_RECV, just remove it silently.
416                  * There is no good way to pass the error to the newly
417                  * created socket, and POSIX does not want network
418                  * errors returned from accept().
419                  */
420                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
421                 tcp_listendrop(req->rsk_listener);
422         }
423         reqsk_put(req);
424 }
425 EXPORT_SYMBOL(tcp_req_err);
426
427 /* TCP-LD (RFC 6069) logic */
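/* Called when an ICMP error refers to the segment at snd_una while we are
 * retransmitting with exponential backoff: undo one backoff step, recompute
 * the RTO and re-arm the retransmit timer, or retransmit immediately if the
 * reverted RTO has already expired.
 */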
428 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
429 {
430         struct inet_connection_sock *icsk = inet_csk(sk);
431         struct tcp_sock *tp = tcp_sk(sk);
432         struct sk_buff *skb;
433         s32 remaining;
434         u32 delta_us;
435
436         if (sock_owned_by_user(sk))
437                 return;
438
439         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
440             !icsk->icsk_backoff)
441                 return;
442
443         skb = tcp_rtx_queue_head(sk);
444         if (WARN_ON_ONCE(!skb))
445                 return;
446
447         icsk->icsk_backoff--;
448         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
449         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
450
451         tcp_mstamp_refresh(tp);
452         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
453         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
454
455         if (remaining > 0) {
456                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
457                                           remaining, TCP_RTO_MAX);
458         } else {
459                 /* RTO revert clocked out retransmission.
460                  * Will retransmit now.
461                  */
462                 tcp_retransmit_timer(sk);
463         }
464 }
465 EXPORT_SYMBOL(tcp_ld_RTO_revert);
466
467 /*
468  * This routine is called by the ICMP module when it gets some
469  * sort of error condition.  If err < 0 then the socket should
470  * be closed and the error returned to the user.  If err > 0
471  * it's just the icmp type << 8 | icmp code.  After adjustment
472  * header points to the first 8 bytes of the tcp header.  We need
473  * to find the appropriate port.
474  *
475  * The locking strategy used here is very "optimistic". When
476  * someone else accesses the socket the ICMP is just dropped
477  * and for some paths there is no check at all.
478  * A more general error queue to queue errors for later handling
479  * is probably better.
480  *
481  */
482
483 int tcp_v4_err(struct sk_buff *skb, u32 info)
484 {
485         const struct iphdr *iph = (const struct iphdr *)skb->data;
486         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
487         struct tcp_sock *tp;
488         struct inet_sock *inet;
489         const int type = icmp_hdr(skb)->type;
490         const int code = icmp_hdr(skb)->code;
491         struct sock *sk;
492         struct request_sock *fastopen;
493         u32 seq, snd_una;
494         int err;
495         struct net *net = dev_net(skb->dev);
496
497         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
498                                        th->dest, iph->saddr, ntohs(th->source),
499                                        inet_iif(skb), 0);
500         if (!sk) {
501                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
502                 return -ENOENT;
503         }
504         if (sk->sk_state == TCP_TIME_WAIT) {
505                 inet_twsk_put(inet_twsk(sk));
506                 return 0;
507         }
508         seq = ntohl(th->seq);
509         if (sk->sk_state == TCP_NEW_SYN_RECV) {
510                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
511                                      type == ICMP_TIME_EXCEEDED ||
512                                      (type == ICMP_DEST_UNREACH &&
513                                       (code == ICMP_NET_UNREACH ||
514                                        code == ICMP_HOST_UNREACH)));
515                 return 0;
516         }
517
518         bh_lock_sock(sk);
519         /* If too many ICMPs get dropped on busy
520          * servers this needs to be solved differently.
521          * We do take care of the PMTU discovery (RFC 1191) special case:
522          * we can receive locally generated ICMP messages while the socket is held.
523          */
524         if (sock_owned_by_user(sk)) {
525                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
526                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
527         }
528         if (sk->sk_state == TCP_CLOSE)
529                 goto out;
530
531         if (static_branch_unlikely(&ip4_min_ttl)) {
532                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
533                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
534                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
535                         goto out;
536                 }
537         }
538
539         tp = tcp_sk(sk);
540         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
541         fastopen = rcu_dereference(tp->fastopen_rsk);
542         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
543         if (sk->sk_state != TCP_LISTEN &&
544             !between(seq, snd_una, tp->snd_nxt)) {
545                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
546                 goto out;
547         }
548
549         switch (type) {
550         case ICMP_REDIRECT:
551                 if (!sock_owned_by_user(sk))
552                         do_redirect(skb, sk);
553                 goto out;
554         case ICMP_SOURCE_QUENCH:
555                 /* Just silently ignore these. */
556                 goto out;
557         case ICMP_PARAMETERPROB:
558                 err = EPROTO;
559                 break;
560         case ICMP_DEST_UNREACH:
561                 if (code > NR_ICMP_UNREACH)
562                         goto out;
563
564                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
565                         /* We are not interested in TCP_LISTEN and open_requests
566                          * (SYN-ACKs sent out by Linux are always < 576 bytes, so
567                          * they should go through unfragmented).
568                          */
569                         if (sk->sk_state == TCP_LISTEN)
570                                 goto out;
571
572                         WRITE_ONCE(tp->mtu_info, info);
573                         if (!sock_owned_by_user(sk)) {
574                                 tcp_v4_mtu_reduced(sk);
575                         } else {
576                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
577                                         sock_hold(sk);
578                         }
579                         goto out;
580                 }
581
582                 err = icmp_err_convert[code].errno;
583                 /* check if this ICMP message allows revert of backoff.
584                  * (see RFC 6069)
585                  */
586                 if (!fastopen &&
587                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
588                         tcp_ld_RTO_revert(sk, seq);
589                 break;
590         case ICMP_TIME_EXCEEDED:
591                 err = EHOSTUNREACH;
592                 break;
593         default:
594                 goto out;
595         }
596
597         switch (sk->sk_state) {
598         case TCP_SYN_SENT:
599         case TCP_SYN_RECV:
600                 /* Only in fast or simultaneous open. If a fast open socket is
601                  * already accepted it is treated as a connected one below.
602                  */
603                 if (fastopen && !fastopen->sk)
604                         break;
605
606                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
607
608                 if (!sock_owned_by_user(sk)) {
609                         sk->sk_err = err;
610
611                         sk_error_report(sk);
612
613                         tcp_done(sk);
614                 } else {
615                         sk->sk_err_soft = err;
616                 }
617                 goto out;
618         }
619
620         /* If we've already connected we will keep trying
621          * until we time out, or the user gives up.
622          *
623          * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
624          * to be considered hard errors (well, FRAG_FAILED too,
625          * but it is obsoleted by PMTU discovery).
626          *
627          * Note that in the modern internet, where routing is unreliable
628          * and broken firewalls sit in every dark corner sending random
629          * errors on behalf of their masters, even these two messages have
630          * lost their original meaning (even Linux sends invalid PORT_UNREACHs).
631          *
632          * Now we are in compliance with the RFCs.
633          *                                                      --ANK (980905)
634          */
635
636         inet = inet_sk(sk);
637         if (!sock_owned_by_user(sk) && inet->recverr) {
638                 sk->sk_err = err;
639                 sk_error_report(sk);
640         } else  { /* Only an error on timeout */
641                 sk->sk_err_soft = err;
642         }
643
644 out:
645         bh_unlock_sock(sk);
646         sock_put(sk);
647         return 0;
648 }
649
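/* Prepare @skb for checksum offload: seed th->check with the pseudo-header
 * checksum and record csum_start/csum_offset so that the NIC (or the software
 * fallback) can fill in the final TCP checksum.
 */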
650 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
651 {
652         struct tcphdr *th = tcp_hdr(skb);
653
654         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
655         skb->csum_start = skb_transport_header(skb) - skb->head;
656         skb->csum_offset = offsetof(struct tcphdr, check);
657 }
658
659 /* This routine computes an IPv4 TCP checksum. */
660 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
661 {
662         const struct inet_sock *inet = inet_sk(sk);
663
664         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
665 }
666 EXPORT_SYMBOL(tcp_v4_send_check);
667
668 /*
669  *      This routine will send an RST to the other tcp.
670  *
671  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
672  *                    for the reset?
673  *      Answer: if a packet caused an RST, it is not for a socket
674  *              existing in our system; if it did match a socket, it is
675  *              just a duplicate segment or a bug in the other side's TCP.
676  *              So we build the reply based only on the parameters that
677  *              arrived with the segment.
678  *      Exception: precedence violation. We do not implement it in any case.
679  */
680
681 #ifdef CONFIG_TCP_MD5SIG
682 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
683 #else
684 #define OPTION_BYTES sizeof(__be32)
685 #endif
686
687 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
688 {
689         const struct tcphdr *th = tcp_hdr(skb);
690         struct {
691                 struct tcphdr th;
692                 __be32 opt[OPTION_BYTES / sizeof(__be32)];
693         } rep;
694         struct ip_reply_arg arg;
695 #ifdef CONFIG_TCP_MD5SIG
696         struct tcp_md5sig_key *key = NULL;
697         const __u8 *hash_location = NULL;
698         unsigned char newhash[16];
699         int genhash;
700         struct sock *sk1 = NULL;
701 #endif
702         u64 transmit_time = 0;
703         struct sock *ctl_sk;
704         struct net *net;
705
706         /* Never send a reset in response to a reset. */
707         if (th->rst)
708                 return;
709
710         /* If sk is not NULL, it means we did a successful lookup and the
711          * incoming route had to be correct. prequeue might have dropped our dst.
712          */
713         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
714                 return;
715
716         /* Swap the send and the receive. */
717         memset(&rep, 0, sizeof(rep));
718         rep.th.dest   = th->source;
719         rep.th.source = th->dest;
720         rep.th.doff   = sizeof(struct tcphdr) / 4;
721         rep.th.rst    = 1;
722
723         if (th->ack) {
724                 rep.th.seq = th->ack_seq;
725         } else {
726                 rep.th.ack = 1;
727                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
728                                        skb->len - (th->doff << 2));
729         }
730
731         memset(&arg, 0, sizeof(arg));
732         arg.iov[0].iov_base = (unsigned char *)&rep;
733         arg.iov[0].iov_len  = sizeof(rep.th);
734
735         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
736 #ifdef CONFIG_TCP_MD5SIG
737         rcu_read_lock();
738         hash_location = tcp_parse_md5sig_option(th);
739         if (sk && sk_fullsock(sk)) {
740                 const union tcp_md5_addr *addr;
741                 int l3index;
742
743                 /* sdif set means the packet ingressed via a device
744                  * in an L3 domain and inet_iif is set to it.
745                  */
746                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
747                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
748                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
749         } else if (hash_location) {
750                 const union tcp_md5_addr *addr;
751                 int sdif = tcp_v4_sdif(skb);
752                 int dif = inet_iif(skb);
753                 int l3index;
754
755                 /*
756                  * The active side is gone. Try to find a listening socket
757                  * via the source port, then look up the MD5 key through
758                  * that listening socket. We do not lose any security here:
759                  * the incoming packet is verified against the MD5 hash of
760                  * the found key; no RST is generated if the hash doesn't match.
761                  */
762                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
763                                              ip_hdr(skb)->saddr,
764                                              th->source, ip_hdr(skb)->daddr,
765                                              ntohs(th->source), dif, sdif);
766                 /* don't send rst if it can't find key */
767                 if (!sk1)
768                         goto out;
769
770                 /* sdif set means the packet ingressed via a device
771                  * in an L3 domain and dif is set to it.
772                  */
773                 l3index = sdif ? dif : 0;
774                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
775                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
776                 if (!key)
777                         goto out;
778
779
780                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
781                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
782                         goto out;
783
784         }
785
786         if (key) {
787                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
788                                    (TCPOPT_NOP << 16) |
789                                    (TCPOPT_MD5SIG << 8) |
790                                    TCPOLEN_MD5SIG);
791                 /* Update length and the length the header thinks exists */
792                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
793                 rep.th.doff = arg.iov[0].iov_len / 4;
794
795                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
796                                      key, ip_hdr(skb)->saddr,
797                                      ip_hdr(skb)->daddr, &rep.th);
798         }
799 #endif
800         /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
801         if (rep.opt[0] == 0) {
802                 __be32 mrst = mptcp_reset_option(skb);
803
804                 if (mrst) {
805                         rep.opt[0] = mrst;
806                         arg.iov[0].iov_len += sizeof(mrst);
807                         rep.th.doff = arg.iov[0].iov_len / 4;
808                 }
809         }
810
811         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
812                                       ip_hdr(skb)->saddr, /* XXX */
813                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
814         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
815         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
816
817         /* When the socket is gone, all binding information is lost and
818          * routing might fail. There is no good choice here: if we force the
819          * input interface, we will misroute in the case of an asymmetric route.
820          */
821         if (sk) {
822                 arg.bound_dev_if = sk->sk_bound_dev_if;
823                 if (sk_fullsock(sk))
824                         trace_tcp_send_reset(sk, skb);
825         }
826
827         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
828                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
829
830         arg.tos = ip_hdr(skb)->tos;
831         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
832         local_bh_disable();
833         ctl_sk = this_cpu_read(ipv4_tcp_sk);
834         sock_net_set(ctl_sk, net);
835         if (sk) {
836                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
837                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
838                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
839                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
840                 transmit_time = tcp_transmit_time(sk);
841                 xfrm_sk_clone_policy(ctl_sk, sk);
842         }
843         ip_send_unicast_reply(ctl_sk,
844                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
845                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
846                               &arg, arg.iov[0].iov_len,
847                               transmit_time);
848
849         ctl_sk->sk_mark = 0;
850         xfrm_sk_free_policy(ctl_sk);
851         sock_net_set(ctl_sk, &init_net);
852         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
853         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
854         local_bh_enable();
855
856 #ifdef CONFIG_TCP_MD5SIG
857 out:
858         rcu_read_unlock();
859 #endif
860 }
861
862 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
863    outside of socket context, is certainly ugly. What can I do?
864  */
865
866 static void tcp_v4_send_ack(const struct sock *sk,
867                             struct sk_buff *skb, u32 seq, u32 ack,
868                             u32 win, u32 tsval, u32 tsecr, int oif,
869                             struct tcp_md5sig_key *key,
870                             int reply_flags, u8 tos)
871 {
872         const struct tcphdr *th = tcp_hdr(skb);
873         struct {
874                 struct tcphdr th;
875                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
876 #ifdef CONFIG_TCP_MD5SIG
877                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
878 #endif
879                         ];
880         } rep;
881         struct net *net = sock_net(sk);
882         struct ip_reply_arg arg;
883         struct sock *ctl_sk;
884         u64 transmit_time;
885
886         memset(&rep.th, 0, sizeof(struct tcphdr));
887         memset(&arg, 0, sizeof(arg));
888
889         arg.iov[0].iov_base = (unsigned char *)&rep;
890         arg.iov[0].iov_len  = sizeof(rep.th);
891         if (tsecr) {
892                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
893                                    (TCPOPT_TIMESTAMP << 8) |
894                                    TCPOLEN_TIMESTAMP);
895                 rep.opt[1] = htonl(tsval);
896                 rep.opt[2] = htonl(tsecr);
897                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
898         }
899
900         /* Swap the send and the receive. */
901         rep.th.dest    = th->source;
902         rep.th.source  = th->dest;
903         rep.th.doff    = arg.iov[0].iov_len / 4;
904         rep.th.seq     = htonl(seq);
905         rep.th.ack_seq = htonl(ack);
906         rep.th.ack     = 1;
907         rep.th.window  = htons(win);
908
909 #ifdef CONFIG_TCP_MD5SIG
910         if (key) {
911                 int offset = (tsecr) ? 3 : 0;
912
913                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
914                                           (TCPOPT_NOP << 16) |
915                                           (TCPOPT_MD5SIG << 8) |
916                                           TCPOLEN_MD5SIG);
917                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
918                 rep.th.doff = arg.iov[0].iov_len/4;
919
920                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
921                                     key, ip_hdr(skb)->saddr,
922                                     ip_hdr(skb)->daddr, &rep.th);
923         }
924 #endif
925         arg.flags = reply_flags;
926         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
927                                       ip_hdr(skb)->saddr, /* XXX */
928                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
929         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
930         if (oif)
931                 arg.bound_dev_if = oif;
932         arg.tos = tos;
933         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
934         local_bh_disable();
935         ctl_sk = this_cpu_read(ipv4_tcp_sk);
936         sock_net_set(ctl_sk, net);
937         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
938                            inet_twsk(sk)->tw_mark : sk->sk_mark;
939         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
940                            inet_twsk(sk)->tw_priority : sk->sk_priority;
941         transmit_time = tcp_transmit_time(sk);
942         ip_send_unicast_reply(ctl_sk,
943                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
944                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
945                               &arg, arg.iov[0].iov_len,
946                               transmit_time);
947
948         ctl_sk->sk_mark = 0;
949         sock_net_set(ctl_sk, &init_net);
950         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
951         local_bh_enable();
952 }
953
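/* A segment hit a TIME-WAIT socket: answer with an ACK re-announcing
 * rcv_nxt and the current receive window, echoing the peer's timestamp.
 */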
954 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
955 {
956         struct inet_timewait_sock *tw = inet_twsk(sk);
957         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
958
959         tcp_v4_send_ack(sk, skb,
960                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
961                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
962                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
963                         tcptw->tw_ts_recent,
964                         tw->tw_bound_dev_if,
965                         tcp_twsk_md5_key(tcptw),
966                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
967                         tw->tw_tos
968                         );
969
970         inet_twsk_put(tw);
971 }
972
973 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
974                                   struct request_sock *req)
975 {
976         const union tcp_md5_addr *addr;
977         int l3index;
978
979         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
980          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
981          */
982         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
983                                              tcp_sk(sk)->snd_nxt;
984
985         /* RFC 7323 2.3
986          * The window field (SEG.WND) of every outgoing segment, with the
987          * exception of <SYN> segments, MUST be right-shifted by
988          * Rcv.Wind.Shift bits:
989          */
990         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
991         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
992         tcp_v4_send_ack(sk, skb, seq,
993                         tcp_rsk(req)->rcv_nxt,
994                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
995                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
996                         req->ts_recent,
997                         0,
998                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
999                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1000                         ip_hdr(skb)->tos);
1001 }
1002
1003 /*
1004  *      Send a SYN-ACK after having received a SYN.
1005  *      This still operates on a request_sock only, not on a big
1006  *      socket.
1007  */
1008 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1009                               struct flowi *fl,
1010                               struct request_sock *req,
1011                               struct tcp_fastopen_cookie *foc,
1012                               enum tcp_synack_type synack_type,
1013                               struct sk_buff *syn_skb)
1014 {
1015         const struct inet_request_sock *ireq = inet_rsk(req);
1016         struct flowi4 fl4;
1017         int err = -1;
1018         struct sk_buff *skb;
1019         u8 tos;
1020
1021         /* First, grab a route. */
1022         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1023                 return -1;
1024
1025         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1026
1027         if (skb) {
1028                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1029
1030                 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1031                                 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1032                                 (inet_sk(sk)->tos & INET_ECN_MASK) :
1033                                 inet_sk(sk)->tos;
1034
1035                 if (!INET_ECN_is_capable(tos) &&
1036                     tcp_bpf_ca_needs_ecn((struct sock *)req))
1037                         tos |= INET_ECN_ECT_0;
1038
1039                 rcu_read_lock();
1040                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1041                                             ireq->ir_rmt_addr,
1042                                             rcu_dereference(ireq->ireq_opt),
1043                                             tos);
1044                 rcu_read_unlock();
1045                 err = net_xmit_eval(err);
1046         }
1047
1048         return err;
1049 }
1050
1051 /*
1052  *      IPv4 request_sock destructor.
1053  */
1054 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1055 {
1056         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1057 }
1058
1059 #ifdef CONFIG_TCP_MD5SIG
1060 /*
1061  * RFC2385 MD5 checksumming requires a mapping of
1062  * IP address->MD5 Key.
1063  * We need to maintain these in the sk structure.
1064  */
1065
1066 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1067 EXPORT_SYMBOL(tcp_md5_needed);
1068
1069 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1070 {
1071         if (!old)
1072                 return true;
1073
1074         /* l3index always overrides non-l3index */
1075         if (old->l3index && new->l3index == 0)
1076                 return false;
1077         if (old->l3index == 0 && new->l3index)
1078                 return true;
1079
1080         return old->prefixlen < new->prefixlen;
1081 }
1082
1083 /* Find the Key structure for an address.  */
1084 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1085                                            const union tcp_md5_addr *addr,
1086                                            int family)
1087 {
1088         const struct tcp_sock *tp = tcp_sk(sk);
1089         struct tcp_md5sig_key *key;
1090         const struct tcp_md5sig_info *md5sig;
1091         __be32 mask;
1092         struct tcp_md5sig_key *best_match = NULL;
1093         bool match;
1094
1095         /* caller either holds rcu_read_lock() or socket lock */
1096         md5sig = rcu_dereference_check(tp->md5sig_info,
1097                                        lockdep_sock_is_held(sk));
1098         if (!md5sig)
1099                 return NULL;
1100
1101         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1102                                  lockdep_sock_is_held(sk)) {
1103                 if (key->family != family)
1104                         continue;
1105                 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1106                         continue;
1107                 if (family == AF_INET) {
1108                         mask = inet_make_mask(key->prefixlen);
1109                         match = (key->addr.a4.s_addr & mask) ==
1110                                 (addr->a4.s_addr & mask);
1111 #if IS_ENABLED(CONFIG_IPV6)
1112                 } else if (family == AF_INET6) {
1113                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1114                                                   key->prefixlen);
1115 #endif
1116                 } else {
1117                         match = false;
1118                 }
1119
1120                 if (match && better_md5_match(best_match, key))
1121                         best_match = key;
1122         }
1123         return best_match;
1124 }
1125 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1126
1127 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1128                                                       const union tcp_md5_addr *addr,
1129                                                       int family, u8 prefixlen,
1130                                                       int l3index, u8 flags)
1131 {
1132         const struct tcp_sock *tp = tcp_sk(sk);
1133         struct tcp_md5sig_key *key;
1134         unsigned int size = sizeof(struct in_addr);
1135         const struct tcp_md5sig_info *md5sig;
1136
1137         /* caller either holds rcu_read_lock() or socket lock */
1138         md5sig = rcu_dereference_check(tp->md5sig_info,
1139                                        lockdep_sock_is_held(sk));
1140         if (!md5sig)
1141                 return NULL;
1142 #if IS_ENABLED(CONFIG_IPV6)
1143         if (family == AF_INET6)
1144                 size = sizeof(struct in6_addr);
1145 #endif
1146         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1147                                  lockdep_sock_is_held(sk)) {
1148                 if (key->family != family)
1149                         continue;
1150                 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1151                         continue;
1152                 if (key->l3index != l3index)
1153                         continue;
1154                 if (!memcmp(&key->addr, addr, size) &&
1155                     key->prefixlen == prefixlen)
1156                         return key;
1157         }
1158         return NULL;
1159 }
1160
1161 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1162                                          const struct sock *addr_sk)
1163 {
1164         const union tcp_md5_addr *addr;
1165         int l3index;
1166
1167         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1168                                                  addr_sk->sk_bound_dev_if);
1169         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1170         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1171 }
1172 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1173
1174 /* This can be called on a newly created socket, from other files */
1175 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1176                    int family, u8 prefixlen, int l3index, u8 flags,
1177                    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1178 {
1179         /* Add Key to the list */
1180         struct tcp_md5sig_key *key;
1181         struct tcp_sock *tp = tcp_sk(sk);
1182         struct tcp_md5sig_info *md5sig;
1183
1184         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1185         if (key) {
1186                 /* Pre-existing entry - just update that one.
1187                  * Note that the key might be used concurrently.
1188                  * data_race() is telling KCSAN that we do not care about
1189                  * key mismatches, since changing an MD5 key on live flows
1190                  * can lead to packet drops.
1191                  */
1192                 data_race(memcpy(key->key, newkey, newkeylen));
1193
1194                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1195                  * Also note that a reader could catch the new key->keylen value
1196                  * but the old key->key[]; this is the reason we use __GFP_ZERO
1197                  * at sock_kmalloc() time below these lines.
1198                  */
1199                 WRITE_ONCE(key->keylen, newkeylen);
1200
1201                 return 0;
1202         }
1203
1204         md5sig = rcu_dereference_protected(tp->md5sig_info,
1205                                            lockdep_sock_is_held(sk));
1206         if (!md5sig) {
1207                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1208                 if (!md5sig)
1209                         return -ENOMEM;
1210
1211                 sk_gso_disable(sk);
1212                 INIT_HLIST_HEAD(&md5sig->head);
1213                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1214         }
1215
1216         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1217         if (!key)
1218                 return -ENOMEM;
1219         if (!tcp_alloc_md5sig_pool()) {
1220                 sock_kfree_s(sk, key, sizeof(*key));
1221                 return -ENOMEM;
1222         }
1223
1224         memcpy(key->key, newkey, newkeylen);
1225         key->keylen = newkeylen;
1226         key->family = family;
1227         key->prefixlen = prefixlen;
1228         key->l3index = l3index;
1229         key->flags = flags;
1230         memcpy(&key->addr, addr,
1231                (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1232                                                                  sizeof(struct in_addr));
1233         hlist_add_head_rcu(&key->node, &md5sig->head);
1234         return 0;
1235 }
1236 EXPORT_SYMBOL(tcp_md5_do_add);
1237
1238 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1239                    u8 prefixlen, int l3index, u8 flags)
1240 {
1241         struct tcp_md5sig_key *key;
1242
1243         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1244         if (!key)
1245                 return -ENOENT;
1246         hlist_del_rcu(&key->node);
1247         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1248         kfree_rcu(key, rcu);
1249         return 0;
1250 }
1251 EXPORT_SYMBOL(tcp_md5_do_del);
1252
1253 static void tcp_clear_md5_list(struct sock *sk)
1254 {
1255         struct tcp_sock *tp = tcp_sk(sk);
1256         struct tcp_md5sig_key *key;
1257         struct hlist_node *n;
1258         struct tcp_md5sig_info *md5sig;
1259
1260         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1261
1262         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1263                 hlist_del_rcu(&key->node);
1264                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1265                 kfree_rcu(key, rcu);
1266         }
1267 }
1268
1269 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1270                                  sockptr_t optval, int optlen)
1271 {
1272         struct tcp_md5sig cmd;
1273         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1274         const union tcp_md5_addr *addr;
1275         u8 prefixlen = 32;
1276         int l3index = 0;
1277         u8 flags;
1278
1279         if (optlen < sizeof(cmd))
1280                 return -EINVAL;
1281
1282         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1283                 return -EFAULT;
1284
1285         if (sin->sin_family != AF_INET)
1286                 return -EINVAL;
1287
1288         flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1289
1290         if (optname == TCP_MD5SIG_EXT &&
1291             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1292                 prefixlen = cmd.tcpm_prefixlen;
1293                 if (prefixlen > 32)
1294                         return -EINVAL;
1295         }
1296
1297         if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1298             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1299                 struct net_device *dev;
1300
1301                 rcu_read_lock();
1302                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1303                 if (dev && netif_is_l3_master(dev))
1304                         l3index = dev->ifindex;
1305
1306                 rcu_read_unlock();
1307
1308                 /* ok to reference set/not set outside of rcu;
1309                  * right now device MUST be an L3 master
1310                  */
1311                 if (!dev || !l3index)
1312                         return -EINVAL;
1313         }
1314
1315         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1316
1317         if (!cmd.tcpm_keylen)
1318                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1319
1320         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1321                 return -EINVAL;
1322
1323         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1324                               cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1325 }
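
/* Illustrative sketch (userspace, not part of this file): a key installed
 * with the plain TCP_MD5SIG socket option ends up in tcp_v4_parse_md5_keys()
 * above.  Assuming an already created AF_INET socket fd and an arbitrary
 * example peer address:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */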
1326
1327 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1328                                    __be32 daddr, __be32 saddr,
1329                                    const struct tcphdr *th, int nbytes)
1330 {
1331         struct tcp4_pseudohdr *bp;
1332         struct scatterlist sg;
1333         struct tcphdr *_th;
1334
1335         bp = hp->scratch;
1336         bp->saddr = saddr;
1337         bp->daddr = daddr;
1338         bp->pad = 0;
1339         bp->protocol = IPPROTO_TCP;
1340         bp->len = cpu_to_be16(nbytes);
1341
1342         _th = (struct tcphdr *)(bp + 1);
1343         memcpy(_th, th, sizeof(*th));
1344         _th->check = 0;
1345
1346         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1347         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1348                                 sizeof(*bp) + sizeof(*th));
1349         return crypto_ahash_update(hp->md5_req);
1350 }
1351
1352 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1353                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1354 {
1355         struct tcp_md5sig_pool *hp;
1356         struct ahash_request *req;
1357
1358         hp = tcp_get_md5sig_pool();
1359         if (!hp)
1360                 goto clear_hash_noput;
1361         req = hp->md5_req;
1362
1363         if (crypto_ahash_init(req))
1364                 goto clear_hash;
1365         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1366                 goto clear_hash;
1367         if (tcp_md5_hash_key(hp, key))
1368                 goto clear_hash;
1369         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1370         if (crypto_ahash_final(req))
1371                 goto clear_hash;
1372
1373         tcp_put_md5sig_pool();
1374         return 0;
1375
1376 clear_hash:
1377         tcp_put_md5sig_pool();
1378 clear_hash_noput:
1379         memset(md5_hash, 0, 16);
1380         return 1;
1381 }
1382
1383 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1384                         const struct sock *sk,
1385                         const struct sk_buff *skb)
1386 {
1387         struct tcp_md5sig_pool *hp;
1388         struct ahash_request *req;
1389         const struct tcphdr *th = tcp_hdr(skb);
1390         __be32 saddr, daddr;
1391
1392         if (sk) { /* valid for established/request sockets */
1393                 saddr = sk->sk_rcv_saddr;
1394                 daddr = sk->sk_daddr;
1395         } else {
1396                 const struct iphdr *iph = ip_hdr(skb);
1397                 saddr = iph->saddr;
1398                 daddr = iph->daddr;
1399         }
1400
1401         hp = tcp_get_md5sig_pool();
1402         if (!hp)
1403                 goto clear_hash_noput;
1404         req = hp->md5_req;
1405
1406         if (crypto_ahash_init(req))
1407                 goto clear_hash;
1408
1409         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1410                 goto clear_hash;
1411         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1412                 goto clear_hash;
1413         if (tcp_md5_hash_key(hp, key))
1414                 goto clear_hash;
1415         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1416         if (crypto_ahash_final(req))
1417                 goto clear_hash;
1418
1419         tcp_put_md5sig_pool();
1420         return 0;
1421
1422 clear_hash:
1423         tcp_put_md5sig_pool();
1424 clear_hash_noput:
1425         memset(md5_hash, 0, 16);
1426         return 1;
1427 }
1428 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1429
1430 #endif
1431
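/* IPv4-specific initialisation of a new request sock: mirror the
 * addresses of the incoming SYN and save any IP options for use when
 * the SYN-ACK is sent.
 */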
1432 static void tcp_v4_init_req(struct request_sock *req,
1433                             const struct sock *sk_listener,
1434                             struct sk_buff *skb)
1435 {
1436         struct inet_request_sock *ireq = inet_rsk(req);
1437         struct net *net = sock_net(sk_listener);
1438
1439         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1440         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1441         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1442 }
1443
1444 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1445                                           struct sk_buff *skb,
1446                                           struct flowi *fl,
1447                                           struct request_sock *req)
1448 {
1449         tcp_v4_init_req(req, sk, skb);
1450
1451         if (security_inet_conn_request(sk, skb, req))
1452                 return NULL;
1453
1454         return inet_csk_route_req(sk, &fl->u.ip4, req);
1455 }
1456
1457 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1458         .family         =       PF_INET,
1459         .obj_size       =       sizeof(struct tcp_request_sock),
1460         .rtx_syn_ack    =       tcp_rtx_synack,
1461         .send_ack       =       tcp_v4_reqsk_send_ack,
1462         .destructor     =       tcp_v4_reqsk_destructor,
1463         .send_reset     =       tcp_v4_send_reset,
1464         .syn_ack_timeout =      tcp_syn_ack_timeout,
1465 };
1466
1467 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1468         .mss_clamp      =       TCP_MSS_DEFAULT,
1469 #ifdef CONFIG_TCP_MD5SIG
1470         .req_md5_lookup =       tcp_v4_md5_lookup,
1471         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1472 #endif
1473 #ifdef CONFIG_SYN_COOKIES
1474         .cookie_init_seq =      cookie_v4_init_sequence,
1475 #endif
1476         .route_req      =       tcp_v4_route_req,
1477         .init_seq       =       tcp_v4_init_seq,
1478         .init_ts_off    =       tcp_v4_init_ts_off,
1479         .send_synack    =       tcp_v4_send_synack,
1480 };
1481
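/* Entry point for an incoming SYN on an IPv4 listener: never answer
 * SYNs addressed to broadcast/multicast, otherwise hand off to the
 * address-family independent tcp_conn_request().
 */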
1482 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1483 {
1484         /* Never answer SYNs sent to broadcast or multicast */
1485         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1486                 goto drop;
1487
1488         return tcp_conn_request(&tcp_request_sock_ops,
1489                                 &tcp_request_sock_ipv4_ops, sk, skb);
1490
1491 drop:
1492         tcp_listendrop(sk);
1493         return 0;
1494 }
1495 EXPORT_SYMBOL(tcp_v4_conn_request);
1496
1497
1498 /*
1499  * The three-way handshake has completed - we got a valid ACK -
1500  * now create the new socket.
1501  */
1502 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1503                                   struct request_sock *req,
1504                                   struct dst_entry *dst,
1505                                   struct request_sock *req_unhash,
1506                                   bool *own_req)
1507 {
1508         struct inet_request_sock *ireq;
1509         bool found_dup_sk = false;
1510         struct inet_sock *newinet;
1511         struct tcp_sock *newtp;
1512         struct sock *newsk;
1513 #ifdef CONFIG_TCP_MD5SIG
1514         const union tcp_md5_addr *addr;
1515         struct tcp_md5sig_key *key;
1516         int l3index;
1517 #endif
1518         struct ip_options_rcu *inet_opt;
1519
1520         if (sk_acceptq_is_full(sk))
1521                 goto exit_overflow;
1522
1523         newsk = tcp_create_openreq_child(sk, req, skb);
1524         if (!newsk)
1525                 goto exit_nonewsk;
1526
1527         newsk->sk_gso_type = SKB_GSO_TCPV4;
1528         inet_sk_rx_dst_set(newsk, skb);
1529
1530         newtp                 = tcp_sk(newsk);
1531         newinet               = inet_sk(newsk);
1532         ireq                  = inet_rsk(req);
1533         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1534         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1535         newsk->sk_bound_dev_if = ireq->ir_iif;
1536         newinet->inet_saddr   = ireq->ir_loc_addr;
1537         inet_opt              = rcu_dereference(ireq->ireq_opt);
1538         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1539         newinet->mc_index     = inet_iif(skb);
1540         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1541         newinet->rcv_tos      = ip_hdr(skb)->tos;
1542         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1543         if (inet_opt)
1544                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1545         newinet->inet_id = prandom_u32();
1546
1547         /* Set ToS of the new socket based upon the value of incoming SYN.
1548          * ECT bits are set later in tcp_init_transfer().
1549          */
1550         if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1551                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1552
1553         if (!dst) {
1554                 dst = inet_csk_route_child_sock(sk, newsk, req);
1555                 if (!dst)
1556                         goto put_and_exit;
1557         } else {
1558                 /* syncookie case : see end of cookie_v4_check() */
1559         }
1560         sk_setup_caps(newsk, dst);
1561
1562         tcp_ca_openreq_child(newsk, dst);
1563
1564         tcp_sync_mss(newsk, dst_mtu(dst));
1565         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1566
1567         tcp_initialize_rcv_mss(newsk);
1568
1569 #ifdef CONFIG_TCP_MD5SIG
1570         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1571         /* Copy over the MD5 key from the original socket */
1572         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1573         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1574         if (key) {
1575                 /*
1576                  * We're using one, so create a matching key
1577                  * on the newsk structure. If we fail to get
1578                  * memory, then we end up not copying the key
1579                  * across. Shucks.
1580                  */
1581                 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1582                                key->key, key->keylen, GFP_ATOMIC);
1583                 sk_gso_disable(newsk);
1584         }
1585 #endif
1586
1587         if (__inet_inherit_port(sk, newsk) < 0)
1588                 goto put_and_exit;
1589         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1590                                        &found_dup_sk);
1591         if (likely(*own_req)) {
1592                 tcp_move_syn(newtp, req);
1593                 ireq->ireq_opt = NULL;
1594         } else {
1595                 newinet->inet_opt = NULL;
1596
1597                 if (!req_unhash && found_dup_sk) {
1598                         /* This code path should only be executed in the
1599                          * syncookie case
1600                          */
1601                         bh_unlock_sock(newsk);
1602                         sock_put(newsk);
1603                         newsk = NULL;
1604                 }
1605         }
1606         return newsk;
1607
1608 exit_overflow:
1609         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1610 exit_nonewsk:
1611         dst_release(dst);
1612 exit:
1613         tcp_listendrop(sk);
1614         return NULL;
1615 put_and_exit:
1616         newinet->inet_opt = NULL;
1617         inet_csk_prepare_forced_close(newsk);
1618         tcp_done(newsk);
1619         goto exit;
1620 }
1621 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1622
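/* On a listener, a segment without SYN may be a bare ACK carrying a
 * SYN cookie; cookie_v4_check() validates it and, if valid, creates
 * the child socket to which the segment should be handed.
 */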
1623 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1624 {
1625 #ifdef CONFIG_SYN_COOKIES
1626         const struct tcphdr *th = tcp_hdr(skb);
1627
1628         if (!th->syn)
1629                 sk = cookie_v4_check(sk, skb);
1630 #endif
1631         return sk;
1632 }
1633
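/* Compute the MSS to advertise and a SYN cookie for a SYN that is
 * being answered outside the normal listen path (for instance by a
 * BPF helper); returns 0 when syncookies are not available.
 */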
1634 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1635                          struct tcphdr *th, u32 *cookie)
1636 {
1637         u16 mss = 0;
1638 #ifdef CONFIG_SYN_COOKIES
1639         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1640                                     &tcp_request_sock_ipv4_ops, sk, th);
1641         if (mss) {
1642                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1643                 tcp_synq_overflow(sk);
1644         }
1645 #endif
1646         return mss;
1647 }
1648
1649 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1650                                                            u32));
1651 /* The socket must have its spinlock held when we get
1652  * here, unless it is a TCP_LISTEN socket.
1653  *
1654  * We have a potential double-lock case here, so even when
1655  * doing backlog processing we use the BH locking scheme.
1656  * This is because we cannot sleep with the original spinlock
1657  * held.
1658  */
1659 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1660 {
1661         enum skb_drop_reason reason;
1662         struct sock *rsk;
1663
1664         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1665                 struct dst_entry *dst;
1666
1667                 dst = rcu_dereference_protected(sk->sk_rx_dst,
1668                                                 lockdep_sock_is_held(sk));
1669
1670                 sock_rps_save_rxhash(sk, skb);
1671                 sk_mark_napi_id(sk, skb);
1672                 if (dst) {
1673                         if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1674                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1675                                              dst, 0)) {
1676                                 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1677                                 dst_release(dst);
1678                         }
1679                 }
1680                 tcp_rcv_established(sk, skb);
1681                 return 0;
1682         }
1683
1684         reason = SKB_DROP_REASON_NOT_SPECIFIED;
1685         if (tcp_checksum_complete(skb))
1686                 goto csum_err;
1687
1688         if (sk->sk_state == TCP_LISTEN) {
1689                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1690
1691                 if (!nsk)
1692                         goto discard;
1693                 if (nsk != sk) {
1694                         if (tcp_child_process(sk, nsk, skb)) {
1695                                 rsk = nsk;
1696                                 goto reset;
1697                         }
1698                         return 0;
1699                 }
1700         } else
1701                 sock_rps_save_rxhash(sk, skb);
1702
1703         if (tcp_rcv_state_process(sk, skb)) {
1704                 rsk = sk;
1705                 goto reset;
1706         }
1707         return 0;
1708
1709 reset:
1710         tcp_v4_send_reset(rsk, skb);
1711 discard:
1712         kfree_skb_reason(skb, reason);
1713         /* Be careful here. If this function gets more complicated and
1714          * gcc suffers from register pressure on the x86, sk (in %ebx)
1715          * might be destroyed here. This current version compiles correctly,
1716          * but you have been warned.
1717          */
1718         return 0;
1719
1720 csum_err:
1721         reason = SKB_DROP_REASON_TCP_CSUM;
1722         trace_tcp_bad_csum(skb);
1723         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1724         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1725         goto discard;
1726 }
1727 EXPORT_SYMBOL(tcp_v4_do_rcv);
1728
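/* Early demux, run before routing: look up an established socket by
 * the 4-tuple and, if it has a still-valid cached rx dst for this
 * interface, attach that dst to the skb so the routing lookup can be
 * skipped.
 */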
1729 int tcp_v4_early_demux(struct sk_buff *skb)
1730 {
1731         const struct iphdr *iph;
1732         const struct tcphdr *th;
1733         struct sock *sk;
1734
1735         if (skb->pkt_type != PACKET_HOST)
1736                 return 0;
1737
1738         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1739                 return 0;
1740
1741         iph = ip_hdr(skb);
1742         th = tcp_hdr(skb);
1743
1744         if (th->doff < sizeof(struct tcphdr) / 4)
1745                 return 0;
1746
1747         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1748                                        iph->saddr, th->source,
1749                                        iph->daddr, ntohs(th->dest),
1750                                        skb->skb_iif, inet_sdif(skb));
1751         if (sk) {
1752                 skb->sk = sk;
1753                 skb->destructor = sock_edemux;
1754                 if (sk_fullsock(sk)) {
1755                         struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1756
1757                         if (dst)
1758                                 dst = dst_check(dst, 0);
1759                         if (dst &&
1760                             sk->sk_rx_dst_ifindex == skb->skb_iif)
1761                                 skb_dst_set_noref(skb, dst);
1762                 }
1763         }
1764         return 0;
1765 }
1766
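/* Called with the socket spinlock held while the socket is owned by
 * user context: try to coalesce the skb into the tail of the backlog,
 * otherwise queue it subject to the backlog limit.  Returns true if
 * the skb was dropped (the socket has already been unlocked then).
 */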
1767 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1768                      enum skb_drop_reason *reason)
1769 {
1770         u32 limit, tail_gso_size, tail_gso_segs;
1771         struct skb_shared_info *shinfo;
1772         const struct tcphdr *th;
1773         struct tcphdr *thtail;
1774         struct sk_buff *tail;
1775         unsigned int hdrlen;
1776         bool fragstolen;
1777         u32 gso_segs;
1778         u32 gso_size;
1779         int delta;
1780
1781         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1782          * we can fix skb->truesize to its real value to avoid future drops.
1783          * This is valid because skb is not yet charged to the socket.
1784          * It has been noticed that pure SACK packets were sometimes dropped
1785          * (if cooked by drivers without copybreak feature).
1786          */
1787         skb_condense(skb);
1788
1789         skb_dst_drop(skb);
1790
1791         if (unlikely(tcp_checksum_complete(skb))) {
1792                 bh_unlock_sock(sk);
1793                 trace_tcp_bad_csum(skb);
1794                 *reason = SKB_DROP_REASON_TCP_CSUM;
1795                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1796                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1797                 return true;
1798         }
1799
1800         /* Attempt coalescing to last skb in backlog, even if we are
1801          * above the limits.
1802          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1803          */
1804         th = (const struct tcphdr *)skb->data;
1805         hdrlen = th->doff * 4;
1806
1807         tail = sk->sk_backlog.tail;
1808         if (!tail)
1809                 goto no_coalesce;
1810         thtail = (struct tcphdr *)tail->data;
1811
1812         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1813             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1814             ((TCP_SKB_CB(tail)->tcp_flags |
1815               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1816             !((TCP_SKB_CB(tail)->tcp_flags &
1817               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1818             ((TCP_SKB_CB(tail)->tcp_flags ^
1819               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1820 #ifdef CONFIG_TLS_DEVICE
1821             tail->decrypted != skb->decrypted ||
1822 #endif
1823             thtail->doff != th->doff ||
1824             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1825                 goto no_coalesce;
1826
1827         __skb_pull(skb, hdrlen);
1828
1829         shinfo = skb_shinfo(skb);
1830         gso_size = shinfo->gso_size ?: skb->len;
1831         gso_segs = shinfo->gso_segs ?: 1;
1832
1833         shinfo = skb_shinfo(tail);
1834         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1835         tail_gso_segs = shinfo->gso_segs ?: 1;
1836
1837         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1838                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1839
1840                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1841                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1842                         thtail->window = th->window;
1843                 }
1844
1845                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1846                  * thtail->fin, so that the fast path in tcp_rcv_established()
1847                  * is not entered if we append a packet with a FIN.
1848                  * SYN, RST, URG are not present.
1849                  * ACK is set on both packets.
1850                  * PSH : we do not really care in TCP stack,
1851                  *       at least for 'GRO' packets.
1852                  */
1853                 thtail->fin |= th->fin;
1854                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1855
1856                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1857                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1858                         tail->tstamp = skb->tstamp;
1859                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1860                 }
1861
1862                 /* Not as strict as GRO. We only need to carry mss max value */
1863                 shinfo->gso_size = max(gso_size, tail_gso_size);
1864                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1865
1866                 sk->sk_backlog.len += delta;
1867                 __NET_INC_STATS(sock_net(sk),
1868                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1869                 kfree_skb_partial(skb, fragstolen);
1870                 return false;
1871         }
1872         __skb_push(skb, hdrlen);
1873
1874 no_coalesce:
1875         /* Only the socket owner can try to collapse/prune rx queues
1876          * to reduce memory overhead, so add a little headroom here.
1877          * Only a few socket backlogs are likely to be non-empty at any given time.
1878          */
1879         limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
1880
1881         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1882                 bh_unlock_sock(sk);
1883                 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1884                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1885                 return true;
1886         }
1887         return false;
1888 }
1889 EXPORT_SYMBOL(tcp_add_backlog);
1890
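/* Run the attached socket filter on the skb; the trim cap guarantees
 * that at least the TCP header (th->doff * 4 bytes) is preserved.
 */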
1891 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1892 {
1893         struct tcphdr *th = (struct tcphdr *)skb->data;
1894
1895         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1896 }
1897 EXPORT_SYMBOL(tcp_filter);
1898
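/* Undo tcp_v4_fill_cb(): move the saved IP control block back to its
 * usual place so the skb can go through another lookup/processing pass.
 */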
1899 static void tcp_v4_restore_cb(struct sk_buff *skb)
1900 {
1901         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1902                 sizeof(struct inet_skb_parm));
1903 }
1904
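/* Populate TCP_SKB_CB() with the sequence numbers and flags the TCP
 * state machine expects, preserving the IP control block first.
 */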
1905 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1906                            const struct tcphdr *th)
1907 {
1908         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1909          * barrier() makes sure the compiler won't play fool^Waliasing games.
1910          */
1911         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1912                 sizeof(struct inet_skb_parm));
1913         barrier();
1914
1915         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1916         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1917                                     skb->len - th->doff * 4);
1918         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1919         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1920         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1921         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1922         TCP_SKB_CB(skb)->sacked  = 0;
1923         TCP_SKB_CB(skb)->has_rxtstamp =
1924                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1925 }
1926
1927 /*
1928  *      From tcp_input.c
1929  */
1930
1931 int tcp_v4_rcv(struct sk_buff *skb)
1932 {
1933         struct net *net = dev_net(skb->dev);
1934         enum skb_drop_reason drop_reason;
1935         int sdif = inet_sdif(skb);
1936         int dif = inet_iif(skb);
1937         const struct iphdr *iph;
1938         const struct tcphdr *th;
1939         bool refcounted;
1940         struct sock *sk;
1941         int ret;
1942
1943         drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1944         if (skb->pkt_type != PACKET_HOST)
1945                 goto discard_it;
1946
1947         /* Count it even if it's bad */
1948         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1949
1950         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1951                 goto discard_it;
1952
1953         th = (const struct tcphdr *)skb->data;
1954
1955         if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1956                 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1957                 goto bad_packet;
1958         }
1959         if (!pskb_may_pull(skb, th->doff * 4))
1960                 goto discard_it;
1961
1962         /* An explanation is required here, I think.
1963          * Packet length and doff are validated by header prediction,
1964          * provided the case of th->doff==0 is eliminated.
1965          * So, we defer the checks. */
1966
1967         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1968                 goto csum_error;
1969
1970         th = (const struct tcphdr *)skb->data;
1971         iph = ip_hdr(skb);
1972 lookup:
1973         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1974                                th->dest, sdif, &refcounted);
1975         if (!sk)
1976                 goto no_tcp_socket;
1977
1978 process:
1979         if (sk->sk_state == TCP_TIME_WAIT)
1980                 goto do_time_wait;
1981
1982         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1983                 struct request_sock *req = inet_reqsk(sk);
1984                 bool req_stolen = false;
1985                 struct sock *nsk;
1986
1987                 sk = req->rsk_listener;
1988                 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1989                         drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1990                 else
1991                         drop_reason = tcp_inbound_md5_hash(sk, skb,
1992                                                    &iph->saddr, &iph->daddr,
1993                                                    AF_INET, dif, sdif);
1994                 if (unlikely(drop_reason)) {
1995                         sk_drops_add(sk, skb);
1996                         reqsk_put(req);
1997                         goto discard_it;
1998                 }
1999                 if (tcp_checksum_complete(skb)) {
2000                         reqsk_put(req);
2001                         goto csum_error;
2002                 }
2003                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2004                         nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2005                         if (!nsk) {
2006                                 inet_csk_reqsk_queue_drop_and_put(sk, req);
2007                                 goto lookup;
2008                         }
2009                         sk = nsk;
2010                         /* reuseport_migrate_sock() has already taken one sk_refcnt
2011                          * before returning.
2012                          */
2013                 } else {
2014                         /* We own a reference on the listener, increase it again
2015                          * as we might lose it too soon.
2016                          */
2017                         sock_hold(sk);
2018                 }
2019                 refcounted = true;
2020                 nsk = NULL;
2021                 if (!tcp_filter(sk, skb)) {
2022                         th = (const struct tcphdr *)skb->data;
2023                         iph = ip_hdr(skb);
2024                         tcp_v4_fill_cb(skb, iph, th);
2025                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2026                 } else {
2027                         drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2028                 }
2029                 if (!nsk) {
2030                         reqsk_put(req);
2031                         if (req_stolen) {
2032                                 /* Another cpu got exclusive access to req
2033                                  * and created a full-blown socket.
2034                                  * Try to feed this packet to this socket
2035                                  * instead of discarding it.
2036                                  */
2037                                 tcp_v4_restore_cb(skb);
2038                                 sock_put(sk);
2039                                 goto lookup;
2040                         }
2041                         goto discard_and_relse;
2042                 }
2043                 nf_reset_ct(skb);
2044                 if (nsk == sk) {
2045                         reqsk_put(req);
2046                         tcp_v4_restore_cb(skb);
2047                 } else if (tcp_child_process(sk, nsk, skb)) {
2048                         tcp_v4_send_reset(nsk, skb);
2049                         goto discard_and_relse;
2050                 } else {
2051                         sock_put(sk);
2052                         return 0;
2053                 }
2054         }
2055
2056         if (static_branch_unlikely(&ip4_min_ttl)) {
2057                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2058                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2059                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2060                         goto discard_and_relse;
2061                 }
2062         }
2063
2064         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2065                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2066                 goto discard_and_relse;
2067         }
2068
2069         drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2070                                            &iph->daddr, AF_INET, dif, sdif);
2071         if (drop_reason)
2072                 goto discard_and_relse;
2073
2074         nf_reset_ct(skb);
2075
2076         if (tcp_filter(sk, skb)) {
2077                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2078                 goto discard_and_relse;
2079         }
2080         th = (const struct tcphdr *)skb->data;
2081         iph = ip_hdr(skb);
2082         tcp_v4_fill_cb(skb, iph, th);
2083
2084         skb->dev = NULL;
2085
2086         if (sk->sk_state == TCP_LISTEN) {
2087                 ret = tcp_v4_do_rcv(sk, skb);
2088                 goto put_and_return;
2089         }
2090
2091         sk_incoming_cpu_update(sk);
2092
2093         bh_lock_sock_nested(sk);
2094         tcp_segs_in(tcp_sk(sk), skb);
2095         ret = 0;
2096         if (!sock_owned_by_user(sk)) {
2097                 ret = tcp_v4_do_rcv(sk, skb);
2098         } else {
2099                 if (tcp_add_backlog(sk, skb, &drop_reason))
2100                         goto discard_and_relse;
2101         }
2102         bh_unlock_sock(sk);
2103
2104 put_and_return:
2105         if (refcounted)
2106                 sock_put(sk);
2107
2108         return ret;
2109
2110 no_tcp_socket:
2111         drop_reason = SKB_DROP_REASON_NO_SOCKET;
2112         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2113                 goto discard_it;
2114
2115         tcp_v4_fill_cb(skb, iph, th);
2116
2117         if (tcp_checksum_complete(skb)) {
2118 csum_error:
2119                 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2120                 trace_tcp_bad_csum(skb);
2121                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2122 bad_packet:
2123                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2124         } else {
2125                 tcp_v4_send_reset(NULL, skb);
2126         }
2127
2128 discard_it:
2129         SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2130         /* Discard frame. */
2131         kfree_skb_reason(skb, drop_reason);
2132         return 0;
2133
2134 discard_and_relse:
2135         sk_drops_add(sk, skb);
2136         if (refcounted)
2137                 sock_put(sk);
2138         goto discard_it;
2139
2140 do_time_wait:
2141         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2142                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2143                 inet_twsk_put(inet_twsk(sk));
2144                 goto discard_it;
2145         }
2146
2147         tcp_v4_fill_cb(skb, iph, th);
2148
2149         if (tcp_checksum_complete(skb)) {
2150                 inet_twsk_put(inet_twsk(sk));
2151                 goto csum_error;
2152         }
2153         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2154         case TCP_TW_SYN: {
2155                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2156                                                         &tcp_hashinfo, skb,
2157                                                         __tcp_hdrlen(th),
2158                                                         iph->saddr, th->source,
2159                                                         iph->daddr, th->dest,
2160                                                         inet_iif(skb),
2161                                                         sdif);
2162                 if (sk2) {
2163                         inet_twsk_deschedule_put(inet_twsk(sk));
2164                         sk = sk2;
2165                         tcp_v4_restore_cb(skb);
2166                         refcounted = false;
2167                         goto process;
2168                 }
2169         }
2170                 /* to ACK */
2171                 fallthrough;
2172         case TCP_TW_ACK:
2173                 tcp_v4_timewait_ack(sk, skb);
2174                 break;
2175         case TCP_TW_RST:
2176                 tcp_v4_send_reset(sk, skb);
2177                 inet_twsk_deschedule_put(inet_twsk(sk));
2178                 goto discard_it;
2179         case TCP_TW_SUCCESS:;
2180         }
2181         goto discard_it;
2182 }
2183
2184 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2185         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2186         .twsk_unique    = tcp_twsk_unique,
2187         .twsk_destructor= tcp_twsk_destructor,
2188 };
2189
2190 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2191 {
2192         struct dst_entry *dst = skb_dst(skb);
2193
2194         if (dst && dst_hold_safe(dst)) {
2195                 rcu_assign_pointer(sk->sk_rx_dst, dst);
2196                 sk->sk_rx_dst_ifindex = skb->skb_iif;
2197         }
2198 }
2199 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2200
2201 const struct inet_connection_sock_af_ops ipv4_specific = {
2202         .queue_xmit        = ip_queue_xmit,
2203         .send_check        = tcp_v4_send_check,
2204         .rebuild_header    = inet_sk_rebuild_header,
2205         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2206         .conn_request      = tcp_v4_conn_request,
2207         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2208         .net_header_len    = sizeof(struct iphdr),
2209         .setsockopt        = ip_setsockopt,
2210         .getsockopt        = ip_getsockopt,
2211         .addr2sockaddr     = inet_csk_addr2sockaddr,
2212         .sockaddr_len      = sizeof(struct sockaddr_in),
2213         .mtu_reduced       = tcp_v4_mtu_reduced,
2214 };
2215 EXPORT_SYMBOL(ipv4_specific);
2216
2217 #ifdef CONFIG_TCP_MD5SIG
2218 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2219         .md5_lookup             = tcp_v4_md5_lookup,
2220         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2221         .md5_parse              = tcp_v4_parse_md5_keys,
2222 };
2223 #endif
2224
2225 /* NOTE: A lot of things are set to zero explicitly by the call to
2226  *       sk_alloc(), so they need not be done here.
2227  */
2228 static int tcp_v4_init_sock(struct sock *sk)
2229 {
2230         struct inet_connection_sock *icsk = inet_csk(sk);
2231
2232         tcp_init_sock(sk);
2233
2234         icsk->icsk_af_ops = &ipv4_specific;
2235
2236 #ifdef CONFIG_TCP_MD5SIG
2237         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2238 #endif
2239
2240         return 0;
2241 }
2242
2243 void tcp_v4_destroy_sock(struct sock *sk)
2244 {
2245         struct tcp_sock *tp = tcp_sk(sk);
2246
2247         trace_tcp_destroy_sock(sk);
2248
2249         tcp_clear_xmit_timers(sk);
2250
2251         tcp_cleanup_congestion_control(sk);
2252
2253         tcp_cleanup_ulp(sk);
2254
2255         /* Clean up the write buffer. */
2256         tcp_write_queue_purge(sk);
2257
2258         /* Check if we want to disable active TFO */
2259         tcp_fastopen_active_disable_ofo_check(sk);
2260
2261         /* Cleans up our, hopefully empty, out_of_order_queue. */
2262         skb_rbtree_purge(&tp->out_of_order_queue);
2263
2264 #ifdef CONFIG_TCP_MD5SIG
2265         /* Clean up the MD5 key list, if any */
2266         if (tp->md5sig_info) {
2267                 tcp_clear_md5_list(sk);
2268                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2269                 tp->md5sig_info = NULL;
2270         }
2271 #endif
2272
2273         /* Clean up a referenced TCP bind bucket. */
2274         if (inet_csk(sk)->icsk_bind_hash)
2275                 inet_put_port(sk);
2276
2277         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2278
2279         /* If socket is aborted during connect operation */
2280         tcp_free_fastopen_req(tp);
2281         tcp_fastopen_destroy_cipher(sk);
2282         tcp_saved_syn_free(tp);
2283
2284         sk_sockets_allocated_dec(sk);
2285 }
2286 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2287
2288 #ifdef CONFIG_PROC_FS
2289 /* Proc filesystem TCP sock list dumping. */
2290
2291 static unsigned short seq_file_family(const struct seq_file *seq);
2292
2293 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2294 {
2295         unsigned short family = seq_file_family(seq);
2296
2297         /* AF_UNSPEC is used as a match all */
2298         return ((family == AF_UNSPEC || family == sk->sk_family) &&
2299                 net_eq(sock_net(sk), seq_file_net(seq)));
2300 }
2301
2302 /* Find a non-empty bucket (starting from st->bucket)
2303  * and return the first sk from it.
2304  */
2305 static void *listening_get_first(struct seq_file *seq)
2306 {
2307         struct tcp_iter_state *st = seq->private;
2308
2309         st->offset = 0;
2310         for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2311                 struct inet_listen_hashbucket *ilb2;
2312                 struct hlist_nulls_node *node;
2313                 struct sock *sk;
2314
2315                 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2316                 if (hlist_nulls_empty(&ilb2->nulls_head))
2317                         continue;
2318
2319                 spin_lock(&ilb2->lock);
2320                 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2321                         if (seq_sk_match(seq, sk))
2322                                 return sk;
2323                 }
2324                 spin_unlock(&ilb2->lock);
2325         }
2326
2327         return NULL;
2328 }
2329
2330 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2331  * If "cur" is the last one in st->bucket,
2332  * call listening_get_first() to return the first sk of the next
2333  * non-empty bucket.
2334  */
2335 static void *listening_get_next(struct seq_file *seq, void *cur)
2336 {
2337         struct tcp_iter_state *st = seq->private;
2338         struct inet_listen_hashbucket *ilb2;
2339         struct hlist_nulls_node *node;
2340         struct sock *sk = cur;
2341
2342         ++st->num;
2343         ++st->offset;
2344
2345         sk = sk_nulls_next(sk);
2346         sk_nulls_for_each_from(sk, node) {
2347                 if (seq_sk_match(seq, sk))
2348                         return sk;
2349         }
2350
2351         ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2352         spin_unlock(&ilb2->lock);
2353         ++st->bucket;
2354         return listening_get_first(seq);
2355 }
2356
2357 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2358 {
2359         struct tcp_iter_state *st = seq->private;
2360         void *rc;
2361
2362         st->bucket = 0;
2363         st->offset = 0;
2364         rc = listening_get_first(seq);
2365
2366         while (rc && *pos) {
2367                 rc = listening_get_next(seq, rc);
2368                 --*pos;
2369         }
2370         return rc;
2371 }
2372
2373 static inline bool empty_bucket(const struct tcp_iter_state *st)
2374 {
2375         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2376 }
2377
2378 /*
2379  * Get first established socket starting from bucket given in st->bucket.
2380  * If st->bucket is zero, the very first socket in the hash is returned.
2381  */
2382 static void *established_get_first(struct seq_file *seq)
2383 {
2384         struct tcp_iter_state *st = seq->private;
2385
2386         st->offset = 0;
2387         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2388                 struct sock *sk;
2389                 struct hlist_nulls_node *node;
2390                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2391
2392                 /* Lockless fast path for the common case of empty buckets */
2393                 if (empty_bucket(st))
2394                         continue;
2395
2396                 spin_lock_bh(lock);
2397                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2398                         if (seq_sk_match(seq, sk))
2399                                 return sk;
2400                 }
2401                 spin_unlock_bh(lock);
2402         }
2403
2404         return NULL;
2405 }
2406
2407 static void *established_get_next(struct seq_file *seq, void *cur)
2408 {
2409         struct tcp_iter_state *st = seq->private;
2410         struct hlist_nulls_node *node;
2411         struct sock *sk = cur;
2412
2413         ++st->num;
2414         ++st->offset;
2415
2416         sk = sk_nulls_next(sk);
2417
2418         sk_nulls_for_each_from(sk, node) {
2419                 if (seq_sk_match(seq, sk))
2420                         return sk;
2421         }
2422
2423         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2424         ++st->bucket;
2425         return established_get_first(seq);
2426 }
2427
2428 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2429 {
2430         struct tcp_iter_state *st = seq->private;
2431         void *rc;
2432
2433         st->bucket = 0;
2434         rc = established_get_first(seq);
2435
2436         while (rc && pos) {
2437                 rc = established_get_next(seq, rc);
2438                 --pos;
2439         }
2440         return rc;
2441 }
2442
2443 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2444 {
2445         void *rc;
2446         struct tcp_iter_state *st = seq->private;
2447
2448         st->state = TCP_SEQ_STATE_LISTENING;
2449         rc        = listening_get_idx(seq, &pos);
2450
2451         if (!rc) {
2452                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2453                 rc        = established_get_idx(seq, pos);
2454         }
2455
2456         return rc;
2457 }
2458
2459 static void *tcp_seek_last_pos(struct seq_file *seq)
2460 {
2461         struct tcp_iter_state *st = seq->private;
2462         int bucket = st->bucket;
2463         int offset = st->offset;
2464         int orig_num = st->num;
2465         void *rc = NULL;
2466
2467         switch (st->state) {
2468         case TCP_SEQ_STATE_LISTENING:
2469                 if (st->bucket > tcp_hashinfo.lhash2_mask)
2470                         break;
2471                 st->state = TCP_SEQ_STATE_LISTENING;
2472                 rc = listening_get_first(seq);
2473                 while (offset-- && rc && bucket == st->bucket)
2474                         rc = listening_get_next(seq, rc);
2475                 if (rc)
2476                         break;
2477                 st->bucket = 0;
2478                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2479                 fallthrough;
2480         case TCP_SEQ_STATE_ESTABLISHED:
2481                 if (st->bucket > tcp_hashinfo.ehash_mask)
2482                         break;
2483                 rc = established_get_first(seq);
2484                 while (offset-- && rc && bucket == st->bucket)
2485                         rc = established_get_next(seq, rc);
2486         }
2487
2488         st->num = orig_num;
2489
2490         return rc;
2491 }
2492
2493 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2494 {
2495         struct tcp_iter_state *st = seq->private;
2496         void *rc;
2497
2498         if (*pos && *pos == st->last_pos) {
2499                 rc = tcp_seek_last_pos(seq);
2500                 if (rc)
2501                         goto out;
2502         }
2503
2504         st->state = TCP_SEQ_STATE_LISTENING;
2505         st->num = 0;
2506         st->bucket = 0;
2507         st->offset = 0;
2508         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2509
2510 out:
2511         st->last_pos = *pos;
2512         return rc;
2513 }
2514 EXPORT_SYMBOL(tcp_seq_start);
2515
2516 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2517 {
2518         struct tcp_iter_state *st = seq->private;
2519         void *rc = NULL;
2520
2521         if (v == SEQ_START_TOKEN) {
2522                 rc = tcp_get_idx(seq, 0);
2523                 goto out;
2524         }
2525
2526         switch (st->state) {
2527         case TCP_SEQ_STATE_LISTENING:
2528                 rc = listening_get_next(seq, v);
2529                 if (!rc) {
2530                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2531                         st->bucket = 0;
2532                         st->offset = 0;
2533                         rc        = established_get_first(seq);
2534                 }
2535                 break;
2536         case TCP_SEQ_STATE_ESTABLISHED:
2537                 rc = established_get_next(seq, v);
2538                 break;
2539         }
2540 out:
2541         ++*pos;
2542         st->last_pos = *pos;
2543         return rc;
2544 }
2545 EXPORT_SYMBOL(tcp_seq_next);
2546
2547 void tcp_seq_stop(struct seq_file *seq, void *v)
2548 {
2549         struct tcp_iter_state *st = seq->private;
2550
2551         switch (st->state) {
2552         case TCP_SEQ_STATE_LISTENING:
2553                 if (v != SEQ_START_TOKEN)
2554                         spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2555                 break;
2556         case TCP_SEQ_STATE_ESTABLISHED:
2557                 if (v)
2558                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2559                 break;
2560         }
2561 }
2562 EXPORT_SYMBOL(tcp_seq_stop);
2563
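/* The following helpers format one line of /proc/net/tcp for a request
 * socket, a full socket and a TIME_WAIT socket respectively.
 */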
2564 static void get_openreq4(const struct request_sock *req,
2565                          struct seq_file *f, int i)
2566 {
2567         const struct inet_request_sock *ireq = inet_rsk(req);
2568         long delta = req->rsk_timer.expires - jiffies;
2569
2570         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2571                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2572                 i,
2573                 ireq->ir_loc_addr,
2574                 ireq->ir_num,
2575                 ireq->ir_rmt_addr,
2576                 ntohs(ireq->ir_rmt_port),
2577                 TCP_SYN_RECV,
2578                 0, 0, /* could print option size, but that is af dependent. */
2579                 1,    /* timers active (only the expire timer) */
2580                 jiffies_delta_to_clock_t(delta),
2581                 req->num_timeout,
2582                 from_kuid_munged(seq_user_ns(f),
2583                                  sock_i_uid(req->rsk_listener)),
2584                 0,  /* non standard timer */
2585                 0, /* open_requests have no inode */
2586                 0,
2587                 req);
2588 }
2589
2590 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2591 {
2592         int timer_active;
2593         unsigned long timer_expires;
2594         const struct tcp_sock *tp = tcp_sk(sk);
2595         const struct inet_connection_sock *icsk = inet_csk(sk);
2596         const struct inet_sock *inet = inet_sk(sk);
2597         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2598         __be32 dest = inet->inet_daddr;
2599         __be32 src = inet->inet_rcv_saddr;
2600         __u16 destp = ntohs(inet->inet_dport);
2601         __u16 srcp = ntohs(inet->inet_sport);
2602         int rx_queue;
2603         int state;
2604
2605         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2606             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2607             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2608                 timer_active    = 1;
2609                 timer_expires   = icsk->icsk_timeout;
2610         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2611                 timer_active    = 4;
2612                 timer_expires   = icsk->icsk_timeout;
2613         } else if (timer_pending(&sk->sk_timer)) {
2614                 timer_active    = 2;
2615                 timer_expires   = sk->sk_timer.expires;
2616         } else {
2617                 timer_active    = 0;
2618                 timer_expires = jiffies;
2619         }
2620
2621         state = inet_sk_state_load(sk);
2622         if (state == TCP_LISTEN)
2623                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2624         else
2625                 /* Because we don't lock the socket,
2626                  * we might find a transient negative value.
2627                  */
2628                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2629                                       READ_ONCE(tp->copied_seq), 0);
2630
2631         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2632                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2633                 i, src, srcp, dest, destp, state,
2634                 READ_ONCE(tp->write_seq) - tp->snd_una,
2635                 rx_queue,
2636                 timer_active,
2637                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2638                 icsk->icsk_retransmits,
2639                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2640                 icsk->icsk_probes_out,
2641                 sock_i_ino(sk),
2642                 refcount_read(&sk->sk_refcnt), sk,
2643                 jiffies_to_clock_t(icsk->icsk_rto),
2644                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2645                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2646                 tcp_snd_cwnd(tp),
2647                 state == TCP_LISTEN ?
2648                     fastopenq->max_qlen :
2649                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2650 }
2651
2652 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2653                                struct seq_file *f, int i)
2654 {
2655         long delta = tw->tw_timer.expires - jiffies;
2656         __be32 dest, src;
2657         __u16 destp, srcp;
2658
2659         dest  = tw->tw_daddr;
2660         src   = tw->tw_rcv_saddr;
2661         destp = ntohs(tw->tw_dport);
2662         srcp  = ntohs(tw->tw_sport);
2663
2664         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2665                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2666                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2667                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2668                 refcount_read(&tw->tw_refcnt), tw);
2669 }
2670
2671 #define TMPSZ 150
2672
2673 static int tcp4_seq_show(struct seq_file *seq, void *v)
2674 {
2675         struct tcp_iter_state *st;
2676         struct sock *sk = v;
2677
2678         seq_setwidth(seq, TMPSZ - 1);
2679         if (v == SEQ_START_TOKEN) {
2680                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2681                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2682                            "inode");
2683                 goto out;
2684         }
2685         st = seq->private;
2686
2687         if (sk->sk_state == TCP_TIME_WAIT)
2688                 get_timewait4_sock(v, seq, st->num);
2689         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2690                 get_openreq4(v, seq, st->num);
2691         else
2692                 get_tcp4_sock(v, seq, st->num);
2693 out:
2694         seq_pad(seq, '\n');
2695         return 0;
2696 }
2697
2698 #ifdef CONFIG_BPF_SYSCALL
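/* The bpf tcp iterator walks the same listening/established hash tables
 * as the /proc code above, but first collects each bucket into a batch
 * of referenced sockets, so the bpf program runs without the bucket
 * lock held.
 */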
2699 struct bpf_tcp_iter_state {
2700         struct tcp_iter_state state;
2701         unsigned int cur_sk;
2702         unsigned int end_sk;
2703         unsigned int max_sk;
2704         struct sock **batch;
2705         bool st_bucket_done;
2706 };
2707
2708 struct bpf_iter__tcp {
2709         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2710         __bpf_md_ptr(struct sock_common *, sk_common);
2711         uid_t uid __aligned(8);
2712 };
2713
2714 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2715                              struct sock_common *sk_common, uid_t uid)
2716 {
2717         struct bpf_iter__tcp ctx;
2718
2719         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2720         ctx.meta = meta;
2721         ctx.sk_common = sk_common;
2722         ctx.uid = uid;
2723         return bpf_iter_run_prog(prog, &ctx);
2724 }
2725
2726 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2727 {
2728         while (iter->cur_sk < iter->end_sk)
2729                 sock_put(iter->batch[iter->cur_sk++]);
2730 }
2731
2732 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2733                                       unsigned int new_batch_sz)
2734 {
2735         struct sock **new_batch;
2736
2737         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2738                              GFP_USER | __GFP_NOWARN);
2739         if (!new_batch)
2740                 return -ENOMEM;
2741
2742         bpf_iter_tcp_put_batch(iter);
2743         kvfree(iter->batch);
2744         iter->batch = new_batch;
2745         iter->max_sk = new_batch_sz;
2746
2747         return 0;
2748 }
2749
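/* Batch the remainder of the current listening bucket, taking a
 * reference on up to iter->max_sk matching sockets.  Returns how many
 * sockets the bucket actually contains so the caller can grow the
 * batch and retry if it did not fit.
 */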
2750 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2751                                                  struct sock *start_sk)
2752 {
2753         struct bpf_tcp_iter_state *iter = seq->private;
2754         struct tcp_iter_state *st = &iter->state;
2755         struct hlist_nulls_node *node;
2756         unsigned int expected = 1;
2757         struct sock *sk;
2758
2759         sock_hold(start_sk);
2760         iter->batch[iter->end_sk++] = start_sk;
2761
2762         sk = sk_nulls_next(start_sk);
2763         sk_nulls_for_each_from(sk, node) {
2764                 if (seq_sk_match(seq, sk)) {
2765                         if (iter->end_sk < iter->max_sk) {
2766                                 sock_hold(sk);
2767                                 iter->batch[iter->end_sk++] = sk;
2768                         }
2769                         expected++;
2770                 }
2771         }
2772         spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2773
2774         return expected;
2775 }
2776
2777 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2778                                                    struct sock *start_sk)
2779 {
2780         struct bpf_tcp_iter_state *iter = seq->private;
2781         struct tcp_iter_state *st = &iter->state;
2782         struct hlist_nulls_node *node;
2783         unsigned int expected = 1;
2784         struct sock *sk;
2785
2786         sock_hold(start_sk);
2787         iter->batch[iter->end_sk++] = start_sk;
2788
2789         sk = sk_nulls_next(start_sk);
2790         sk_nulls_for_each_from(sk, node) {
2791                 if (seq_sk_match(seq, sk)) {
2792                         if (iter->end_sk < iter->max_sk) {
2793                                 sock_hold(sk);
2794                                 iter->batch[iter->end_sk++] = sk;
2795                         }
2796                         expected++;
2797                 }
2798         }
2799         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2800
2801         return expected;
2802 }
2803
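/* Grab the next batch: advance to the next bucket if the previous one
 * was fully consumed, collect its sockets and, if the batch array was
 * too small, resize it (to 3/2 of the bucket size) and retry once.
 */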
2804 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2805 {
2806         struct bpf_tcp_iter_state *iter = seq->private;
2807         struct tcp_iter_state *st = &iter->state;
2808         unsigned int expected;
2809         bool resized = false;
2810         struct sock *sk;
2811
2812         /* The st->bucket is done.  Directly advance to the next
2813          * bucket instead of having tcp_seek_last_pos() skip entries
2814          * one by one in the current bucket, only to find out
2815          * it has to advance to the next bucket.
2816          */
2817         if (iter->st_bucket_done) {
2818                 st->offset = 0;
2819                 st->bucket++;
2820                 if (st->state == TCP_SEQ_STATE_LISTENING &&
2821                     st->bucket > tcp_hashinfo.lhash2_mask) {
2822                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2823                         st->bucket = 0;
2824                 }
2825         }
2826
2827 again:
2828         /* Get a new batch */
2829         iter->cur_sk = 0;
2830         iter->end_sk = 0;
2831         iter->st_bucket_done = false;
2832
2833         sk = tcp_seek_last_pos(seq);
2834         if (!sk)
2835                 return NULL; /* Done */
2836
2837         if (st->state == TCP_SEQ_STATE_LISTENING)
2838                 expected = bpf_iter_tcp_listening_batch(seq, sk);
2839         else
2840                 expected = bpf_iter_tcp_established_batch(seq, sk);
2841
2842         if (iter->end_sk == expected) {
2843                 iter->st_bucket_done = true;
2844                 return sk;
2845         }
2846
2847         if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2848                 resized = true;
2849                 goto again;
2850         }
2851
2852         return sk;
2853 }
2854
2855 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2856 {
2857         /* bpf iter does not support lseek, so it always
2858          * continues from where it was stop()-ped.
2859          */
2860         if (*pos)
2861                 return bpf_iter_tcp_batch(seq);
2862
2863         return SEQ_START_TOKEN;
2864 }
2865
2866 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2867 {
2868         struct bpf_tcp_iter_state *iter = seq->private;
2869         struct tcp_iter_state *st = &iter->state;
2870         struct sock *sk;
2871
2872         /* Whenever seq_next() is called, the sock at iter->cur_sk has
2873          * been through seq_show(), so advance to the next sk in
2874          * the batch.
2875          */
2876         if (iter->cur_sk < iter->end_sk) {
2877                 /* Keep st->num consistent in tcp_iter_state.
2878                  * bpf_iter_tcp does not use st->num;
2879                  * meta.seq_num is used instead.
2880                  */
2881                 st->num++;
2882                 /* Move st->offset to the next sk in the bucket such that
2883                  * the future start() will resume at st->offset in
2884                  * st->bucket.  See tcp_seek_last_pos().
2885                  */
2886                 st->offset++;
2887                 sock_put(iter->batch[iter->cur_sk++]);
2888         }
2889
2890         if (iter->cur_sk < iter->end_sk)
2891                 sk = iter->batch[iter->cur_sk];
2892         else
2893                 sk = bpf_iter_tcp_batch(seq);
2894
2895         ++*pos;
2896         /* Keep st->last_pos consistent in tcp_iter_state.
2897          * bpf iter does not do lseek, so st->last_pos always equals *pos.
2898          */
2899         st->last_pos = *pos;
2900         return sk;
2901 }
2902
2903 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2904 {
2905         struct bpf_iter_meta meta;
2906         struct bpf_prog *prog;
2907         struct sock *sk = v;
2908         bool slow;
2909         uid_t uid;
2910         int ret;
2911
2912         if (v == SEQ_START_TOKEN)
2913                 return 0;
2914
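        /* Request (TCP_NEW_SYN_RECV) and TIME_WAIT entries are not full
         * sockets and carry no socket lock, hence the sk_fullsock()
         * checks guarding lock_sock_fast()/unlock_sock_fast() below.
         */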
2915         if (sk_fullsock(sk))
2916                 slow = lock_sock_fast(sk);
2917
2918         if (unlikely(sk_unhashed(sk))) {
2919                 ret = SEQ_SKIP;
2920                 goto unlock;
2921         }
2922
2923         if (sk->sk_state == TCP_TIME_WAIT) {
2924                 uid = 0;
2925         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2926                 const struct request_sock *req = v;
2927
2928                 uid = from_kuid_munged(seq_user_ns(seq),
2929                                        sock_i_uid(req->rsk_listener));
2930         } else {
2931                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2932         }
2933
2934         meta.seq = seq;
2935         prog = bpf_iter_get_info(&meta, false);
2936         ret = tcp_prog_seq_show(prog, &meta, v, uid);
2937
2938 unlock:
2939         if (sk_fullsock(sk))
2940                 unlock_sock_fast(sk, slow);
2941         return ret;
2943 }
2944
2945 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2946 {
2947         struct bpf_tcp_iter_state *iter = seq->private;
2948         struct bpf_iter_meta meta;
2949         struct bpf_prog *prog;
2950
2951         if (!v) {
2952                 meta.seq = seq;
2953                 prog = bpf_iter_get_info(&meta, true);
2954                 if (prog)
2955                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
2956         }
2957
2958         if (iter->cur_sk < iter->end_sk) {
2959                 bpf_iter_tcp_put_batch(iter);
2960                 iter->st_bucket_done = false;
2961         }
2962 }
2963
2964 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2965         .show           = bpf_iter_tcp_seq_show,
2966         .start          = bpf_iter_tcp_seq_start,
2967         .next           = bpf_iter_tcp_seq_next,
2968         .stop           = bpf_iter_tcp_seq_stop,
2969 };
2970 #endif
2971 static unsigned short seq_file_family(const struct seq_file *seq)
2972 {
2973         const struct tcp_seq_afinfo *afinfo;
2974
2975 #ifdef CONFIG_BPF_SYSCALL
2976         /* Iterated from bpf_iter.  Let the bpf prog filter instead. */
2977         if (seq->op == &bpf_iter_tcp_seq_ops)
2978                 return AF_UNSPEC;
2979 #endif
2980
2981         /* Iterated from proc fs */
2982         afinfo = pde_data(file_inode(seq->file));
2983         return afinfo->family;
2984 }
2985
2986 static const struct seq_operations tcp4_seq_ops = {
2987         .show           = tcp4_seq_show,
2988         .start          = tcp_seq_start,
2989         .next           = tcp_seq_next,
2990         .stop           = tcp_seq_stop,
2991 };
2992
2993 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2994         .family         = AF_INET,
2995 };
2996
2997 static int __net_init tcp4_proc_init_net(struct net *net)
2998 {
2999         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3000                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3001                 return -ENOMEM;
3002         return 0;
3003 }
3004
3005 static void __net_exit tcp4_proc_exit_net(struct net *net)
3006 {
3007         remove_proc_entry("tcp", net->proc_net);
3008 }
3009
3010 static struct pernet_operations tcp4_net_ops = {
3011         .init = tcp4_proc_init_net,
3012         .exit = tcp4_proc_exit_net,
3013 };
3014
3015 int __init tcp4_proc_init(void)
3016 {
3017         return register_pernet_subsys(&tcp4_net_ops);
3018 }
3019
3020 void tcp4_proc_exit(void)
3021 {
3022         unregister_pernet_subsys(&tcp4_net_ops);
3023 }
3024 #endif /* CONFIG_PROC_FS */
3025
3026 /* @wake is one when sk_stream_write_space() calls us.
3027  * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3028  * This mimics the strategy used in sock_def_write_space().
3029  */
3030 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3031 {
3032         const struct tcp_sock *tp = tcp_sk(sk);
3033         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3034                             READ_ONCE(tp->snd_nxt);
3035
3036         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3037 }
3038 EXPORT_SYMBOL(tcp_stream_memory_free);
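/* Worked example for the test above (hypothetical values): with
 * tcp_notsent_lowat(tp) == 131072 and wake == 1, the condition
 * (notsent_bytes << 1) < 131072 holds only for notsent_bytes < 65536,
 * i.e. sk_stream_write_space() ends up signalling EPOLLOUT only once less
 * than half of the not-sent limit remains queued.
 */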
3039
3040 struct proto tcp_prot = {
3041         .name                   = "TCP",
3042         .owner                  = THIS_MODULE,
3043         .close                  = tcp_close,
3044         .pre_connect            = tcp_v4_pre_connect,
3045         .connect                = tcp_v4_connect,
3046         .disconnect             = tcp_disconnect,
3047         .accept                 = inet_csk_accept,
3048         .ioctl                  = tcp_ioctl,
3049         .init                   = tcp_v4_init_sock,
3050         .destroy                = tcp_v4_destroy_sock,
3051         .shutdown               = tcp_shutdown,
3052         .setsockopt             = tcp_setsockopt,
3053         .getsockopt             = tcp_getsockopt,
3054         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3055         .keepalive              = tcp_set_keepalive,
3056         .recvmsg                = tcp_recvmsg,
3057         .sendmsg                = tcp_sendmsg,
3058         .sendpage               = tcp_sendpage,
3059         .backlog_rcv            = tcp_v4_do_rcv,
3060         .release_cb             = tcp_release_cb,
3061         .hash                   = inet_hash,
3062         .unhash                 = inet_unhash,
3063         .get_port               = inet_csk_get_port,
3064         .put_port               = inet_put_port,
3065 #ifdef CONFIG_BPF_SYSCALL
3066         .psock_update_sk_prot   = tcp_bpf_update_proto,
3067 #endif
3068         .enter_memory_pressure  = tcp_enter_memory_pressure,
3069         .leave_memory_pressure  = tcp_leave_memory_pressure,
3070         .stream_memory_free     = tcp_stream_memory_free,
3071         .sockets_allocated      = &tcp_sockets_allocated,
3072         .orphan_count           = &tcp_orphan_count,
3073
3074         .memory_allocated       = &tcp_memory_allocated,
3075         .per_cpu_fw_alloc       = &tcp_memory_per_cpu_fw_alloc,
3076
3077         .memory_pressure        = &tcp_memory_pressure,
3078         .sysctl_mem             = sysctl_tcp_mem,
3079         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3080         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3081         .max_header             = MAX_TCP_HEADER,
3082         .obj_size               = sizeof(struct tcp_sock),
3083         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3084         .twsk_prot              = &tcp_timewait_sock_ops,
3085         .rsk_prot               = &tcp_request_sock_ops,
3086         .h.hashinfo             = &tcp_hashinfo,
3087         .no_autobind            = true,
3088         .diag_destroy           = tcp_abort,
3089 };
3090 EXPORT_SYMBOL(tcp_prot);
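/* Usage note: this is the proto instance a plain
 * socket(AF_INET, SOCK_STREAM, IPPROTO_TCP) call ends up bound to through
 * inet_create() and the inetsw table, so each callback above is a
 * per-socket entry point (e.g. connect(2) reaches tcp_v4_connect() via
 * .connect).
 */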
3091
3092 static void __net_exit tcp_sk_exit(struct net *net)
3093 {
3094         struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
3095
3096         if (net->ipv4.tcp_congestion_control)
3097                 bpf_module_put(net->ipv4.tcp_congestion_control,
3098                                net->ipv4.tcp_congestion_control->owner);
3099         if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
3100                 kfree(tcp_death_row);
3101 }
3102
3103 static int __net_init tcp_sk_init(struct net *net)
3104 {
3105         int cnt;
3106
3107         net->ipv4.sysctl_tcp_ecn = 2;
3108         net->ipv4.sysctl_tcp_ecn_fallback = 1;
3109
3110         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3111         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3112         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3113         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3114         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3115
3116         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3117         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3118         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3119
3120         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3121         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3122         net->ipv4.sysctl_tcp_syncookies = 1;
3123         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3124         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3125         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3126         net->ipv4.sysctl_tcp_orphan_retries = 0;
3127         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3128         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3129         net->ipv4.sysctl_tcp_tw_reuse = 2;
3130         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3131
3132         net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
3133         if (!net->ipv4.tcp_death_row)
3134                 return -ENOMEM;
3135         refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
3136         cnt = tcp_hashinfo.ehash_mask + 1;
3137         net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
3138         net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
3139
3140         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
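        /* Sizing example (hypothetical hash size): with
         * tcp_hashinfo.ehash_mask + 1 == 65536, sysctl_max_tw_buckets above
         * defaults to 32768 and sysctl_max_syn_backlog to
         * max(128, 65536 / 128) == 512.
         */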
3141         net->ipv4.sysctl_tcp_sack = 1;
3142         net->ipv4.sysctl_tcp_window_scaling = 1;
3143         net->ipv4.sysctl_tcp_timestamps = 1;
3144         net->ipv4.sysctl_tcp_early_retrans = 3;
3145         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3146         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3147         net->ipv4.sysctl_tcp_retrans_collapse = 1;
3148         net->ipv4.sysctl_tcp_max_reordering = 300;
3149         net->ipv4.sysctl_tcp_dsack = 1;
3150         net->ipv4.sysctl_tcp_app_win = 31;
3151         net->ipv4.sysctl_tcp_adv_win_scale = 1;
3152         net->ipv4.sysctl_tcp_frto = 2;
3153         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3154         /* This limits the percentage of the congestion window which we
3155          * will allow a single TSO frame to consume.  Building TSO frames
3156          * which are too large can cause TCP streams to be bursty.
3157          */
3158         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3159         /* Default TSQ limit of 16 TSO segments */
3160         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3161
3162         /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3163         net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3164
3165         net->ipv4.sysctl_tcp_min_tso_segs = 2;
3166         net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3167         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3168         net->ipv4.sysctl_tcp_autocorking = 1;
3169         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3170         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3171         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3172         if (net != &init_net) {
3173                 memcpy(net->ipv4.sysctl_tcp_rmem,
3174                        init_net.ipv4.sysctl_tcp_rmem,
3175                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
3176                 memcpy(net->ipv4.sysctl_tcp_wmem,
3177                        init_net.ipv4.sysctl_tcp_wmem,
3178                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
3179         }
3180         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3181         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3182         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3183         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3184         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3185         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3186
3187         /* Reno is always built in */
3188         if (!net_eq(net, &init_net) &&
3189             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3190                                init_net.ipv4.tcp_congestion_control->owner))
3191                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3192         else
3193                 net->ipv4.tcp_congestion_control = &tcp_reno;
3194
3195         return 0;
3196 }
3197
3198 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3199 {
3200         struct net *net;
3201
3202         inet_twsk_purge(&tcp_hashinfo, AF_INET);
3203
3204         list_for_each_entry(net, net_exit_list, exit_list)
3205                 tcp_fastopen_ctx_destroy(net);
3206 }
3207
3208 static struct pernet_operations __net_initdata tcp_sk_ops = {
3209         .init       = tcp_sk_init,
3210         .exit       = tcp_sk_exit,
3211         .exit_batch = tcp_sk_exit_batch,
3212 };
3213
3214 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3215 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3216                      struct sock_common *sk_common, uid_t uid)
3217
3218 #define INIT_BATCH_SZ 16
3219
3220 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3221 {
3222         struct bpf_tcp_iter_state *iter = priv_data;
3223         int err;
3224
3225         err = bpf_iter_init_seq_net(priv_data, aux);
3226         if (err)
3227                 return err;
3228
3229         err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3230         if (err) {
3231                 bpf_iter_fini_seq_net(priv_data);
3232                 return err;
3233         }
3234
3235         return 0;
3236 }
3237
3238 static void bpf_iter_fini_tcp(void *priv_data)
3239 {
3240         struct bpf_tcp_iter_state *iter = priv_data;
3241
3242         bpf_iter_fini_seq_net(priv_data);
3243         kvfree(iter->batch);
3244 }
3245
3246 static const struct bpf_iter_seq_info tcp_seq_info = {
3247         .seq_ops                = &bpf_iter_tcp_seq_ops,
3248         .init_seq_private       = bpf_iter_init_tcp,
3249         .fini_seq_private       = bpf_iter_fini_tcp,
3250         .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3251 };
3252
3253 static const struct bpf_func_proto *
3254 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3255                             const struct bpf_prog *prog)
3256 {
3257         switch (func_id) {
3258         case BPF_FUNC_setsockopt:
3259                 return &bpf_sk_setsockopt_proto;
3260         case BPF_FUNC_getsockopt:
3261                 return &bpf_sk_getsockopt_proto;
3262         default:
3263                 return NULL;
3264         }
3265 }
3266
3267 static struct bpf_iter_reg tcp_reg_info = {
3268         .target                 = "tcp",
3269         .ctx_arg_info_size      = 1,
3270         .ctx_arg_info           = {
3271                 { offsetof(struct bpf_iter__tcp, sk_common),
3272                   PTR_TO_BTF_ID_OR_NULL },
3273         },
3274         .get_func_proto         = bpf_iter_tcp_get_func_proto,
3275         .seq_info               = &tcp_seq_info,
3276 };
3277
3278 static void __init bpf_iter_register(void)
3279 {
3280         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3281         if (bpf_iter_reg_target(&tcp_reg_info))
3282                 pr_warn("Warning: could not register bpf iterator tcp\n");
3283 }
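/* A minimal sketch of a BPF program consuming the "tcp" iterator target
 * registered above (BPF-side C, not part of this file; section and helper
 * names follow common libbpf conventions, so treat the details as an
 * illustrative assumption rather than a reference implementation):
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *sk_common = ctx->sk_common;
 *
 *		// NULL at the end of the iteration, see bpf_iter_tcp_seq_stop()
 *		if (!sk_common)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "family=%d state=%d\n",
 *			       sk_common->skc_family, sk_common->skc_state);
 *		return 0;
 *	}
 */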
3284
3285 #endif
3286
3287 void __init tcp_v4_init(void)
3288 {
3289         int cpu, res;
3290
3291         for_each_possible_cpu(cpu) {
3292                 struct sock *sk;
3293
3294                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3295                                            IPPROTO_TCP, &init_net);
3296                 if (res)
3297                         panic("Failed to create the TCP control socket.\n");
3298                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3299
3300                 /* Enforce IP_DF and IPID==0 for the RSTs and
3301                  * ACKs sent in SYN-RECV and TIME-WAIT state.
3302                  */
3303                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3304
3305                 per_cpu(ipv4_tcp_sk, cpu) = sk;
3306         }
3307         if (register_pernet_subsys(&tcp_sk_ops))
3308                 panic("Failed to create the TCP control socket.\n");
3309
3310 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3311         bpf_iter_register();
3312 #endif
3313 }