ping: Fix potential NULL deref for /proc/net/icmp.
[platform/kernel/linux-starfive.git] net/ipv4/tcp_ipv4.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              Implementation of the Transmission Control Protocol(TCP).
8  *
9  *              IPv4 specific functions
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  */
18
19 /*
20  * Changes:
21  *              David S. Miller :       New socket lookup architecture.
22  *                                      This code is dedicated to John Dyson.
23  *              David S. Miller :       Change semantics of established hash,
24  *                                      half is devoted to TIME_WAIT sockets
25  *                                      and the rest go in the other half.
26  *              Andi Kleen :            Add support for syncookies and fixed
27  *                                      some bugs: ip options weren't passed to
28  *                                      the TCP layer, missed a check for an
29  *                                      ACK bit.
30  *              Andi Kleen :            Implemented fast path mtu discovery.
31  *                                      Fixed many serious bugs in the
32  *                                      request_sock handling and moved
33  *                                      most of it into the af independent code.
34  *                                      Added tail drop and some other bugfixes.
35  *                                      Added new listen semantics.
36  *              Mike McLagan    :       Routing by source
37  *      Juan Jose Ciarlante:            ip_dynaddr bits
38  *              Andi Kleen:             various fixes.
39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
40  *                                      coma.
41  *      Andi Kleen              :       Fix new listen.
42  *      Andi Kleen              :       Fix accept error reporting.
43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
45  *                                      a single port at the same time.
46  */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95
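/* ISN and timestamp-offset helpers for passively opened connections: both are
 * derived from the addresses and ports of the incoming SYN via keyed hashes.
 */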
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98         return secure_tcp_seq(ip_hdr(skb)->daddr,
99                               ip_hdr(skb)->saddr,
100                               tcp_hdr(skb)->dest,
101                               tcp_hdr(skb)->source);
102 }
103
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
108
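/* Decide whether a connecting socket may reuse the 4-tuple currently held by
 * the TIME-WAIT socket @sktw. Returns 1 (and takes a reference on @sktw) if
 * reuse is allowed, 0 otherwise.
 */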
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111         int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112         const struct inet_timewait_sock *tw = inet_twsk(sktw);
113         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114         struct tcp_sock *tp = tcp_sk(sk);
115
116         if (reuse == 2) {
117                 /* Still does not detect *everything* that goes through
118                  * lo, since we require a loopback src or dst address
119                  * or direct binding to 'lo' interface.
120                  */
121                 bool loopback = false;
122                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123                         loopback = true;
124 #if IS_ENABLED(CONFIG_IPV6)
125                 if (tw->tw_family == AF_INET6) {
126                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130                                 loopback = true;
131                 } else
132 #endif
133                 {
134                         if (ipv4_is_loopback(tw->tw_daddr) ||
135                             ipv4_is_loopback(tw->tw_rcv_saddr))
136                                 loopback = true;
137                 }
138                 if (!loopback)
139                         reuse = 0;
140         }
141
142         /* With PAWS, it is safe from the viewpoint
143            of data integrity. Even without PAWS it is safe provided sequence
144            spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
145
146            Actually, the idea is close to VJ's: only the timestamp cache is
147            held not per host, but per port pair, and the TW bucket is used as
148            the state holder.
149
150            If the TW bucket has already been destroyed we fall back to VJ's
151            scheme and use the initial timestamp retrieved from the peer table.
152          */
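        /* tw_ts_recent_stamp is in seconds, so with tcp_tw_reuse enabled the
         * tuple may be reused once at least one second has passed since the
         * last timestamp seen from the peer; PAWS then rejects stray segments
         * from the old incarnation.
         */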
153         if (tcptw->tw_ts_recent_stamp &&
154             (!twp || (reuse && time_after32(ktime_get_seconds(),
155                                             tcptw->tw_ts_recent_stamp)))) {
156                 /* In case of repair and re-using TIME-WAIT sockets we still
157                  * want to be sure that it is safe as above but honor the
158                  * sequence numbers and time stamps set as part of the repair
159                  * process.
160                  *
161                  * Without this check re-using a TIME-WAIT socket with TCP
162                  * repair would accumulate a -1 on the repair assigned
163                  * sequence number. The first time it is reused the sequence
164                  * is -1, the second time -2, etc. This fixes that issue
165                  * without appearing to create any others.
166                  */
167                 if (likely(!tp->repair)) {
168                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169
170                         if (!seq)
171                                 seq = 1;
172                         WRITE_ONCE(tp->write_seq, seq);
173                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
174                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
175                 }
176                 sock_hold(sktw);
177                 return 1;
178         }
179
180         return 0;
181 }
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
183
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185                               int addr_len)
186 {
187         /* This check is replicated from tcp_v4_connect() and intended to
188          * prevent BPF program called below from accessing bytes that are out
189          * of the bound specified by user in addr_len.
190          */
191         if (addr_len < sizeof(struct sockaddr_in))
192                 return -EINVAL;
193
194         sock_owned_by_me(sk);
195
196         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 }
198
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201 {
202         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203         struct inet_timewait_death_row *tcp_death_row;
204         struct inet_sock *inet = inet_sk(sk);
205         struct tcp_sock *tp = tcp_sk(sk);
206         struct ip_options_rcu *inet_opt;
207         struct net *net = sock_net(sk);
208         __be16 orig_sport, orig_dport;
209         __be32 daddr, nexthop;
210         struct flowi4 *fl4;
211         struct rtable *rt;
212         int err;
213
214         if (addr_len < sizeof(struct sockaddr_in))
215                 return -EINVAL;
216
217         if (usin->sin_family != AF_INET)
218                 return -EAFNOSUPPORT;
219
220         nexthop = daddr = usin->sin_addr.s_addr;
221         inet_opt = rcu_dereference_protected(inet->inet_opt,
222                                              lockdep_sock_is_held(sk));
223         if (inet_opt && inet_opt->opt.srr) {
224                 if (!daddr)
225                         return -EINVAL;
226                 nexthop = inet_opt->opt.faddr;
227         }
228
229         orig_sport = inet->inet_sport;
230         orig_dport = usin->sin_port;
231         fl4 = &inet->cork.fl.u.ip4;
232         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
233                               sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
234                               orig_dport, sk);
235         if (IS_ERR(rt)) {
236                 err = PTR_ERR(rt);
237                 if (err == -ENETUNREACH)
238                         IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
239                 return err;
240         }
241
242         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
243                 ip_rt_put(rt);
244                 return -ENETUNREACH;
245         }
246
247         if (!inet_opt || !inet_opt->opt.srr)
248                 daddr = fl4->daddr;
249
250         tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
251
252         if (!inet->inet_saddr) {
253                 err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
254                 if (err) {
255                         ip_rt_put(rt);
256                         return err;
257                 }
258         } else {
259                 sk_rcv_saddr_set(sk, inet->inet_saddr);
260         }
261
262         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
263                 /* Reset inherited state */
264                 tp->rx_opt.ts_recent       = 0;
265                 tp->rx_opt.ts_recent_stamp = 0;
266                 if (likely(!tp->repair))
267                         WRITE_ONCE(tp->write_seq, 0);
268         }
269
270         inet->inet_dport = usin->sin_port;
271         sk_daddr_set(sk, daddr);
272
273         inet_csk(sk)->icsk_ext_hdr_len = 0;
274         if (inet_opt)
275                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
276
277         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
278
279         /* Socket identity is still unknown (sport may be zero).
280          * However we set state to SYN-SENT and, without releasing the socket
281          * lock, select a source port, enter ourselves into the hash tables and
282          * complete initialization after this.
283          */
284         tcp_set_state(sk, TCP_SYN_SENT);
285         err = inet_hash_connect(tcp_death_row, sk);
286         if (err)
287                 goto failure;
288
289         sk_set_txhash(sk);
290
291         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
292                                inet->inet_sport, inet->inet_dport, sk);
293         if (IS_ERR(rt)) {
294                 err = PTR_ERR(rt);
295                 rt = NULL;
296                 goto failure;
297         }
298         /* OK, now commit destination to socket.  */
299         sk->sk_gso_type = SKB_GSO_TCPV4;
300         sk_setup_caps(sk, &rt->dst);
301         rt = NULL;
302
303         if (likely(!tp->repair)) {
304                 if (!tp->write_seq)
305                         WRITE_ONCE(tp->write_seq,
306                                    secure_tcp_seq(inet->inet_saddr,
307                                                   inet->inet_daddr,
308                                                   inet->inet_sport,
309                                                   usin->sin_port));
310                 tp->tsoffset = secure_tcp_ts_off(net, inet->inet_saddr,
311                                                  inet->inet_daddr);
312         }
313
314         inet->inet_id = get_random_u16();
315
316         if (tcp_fastopen_defer_connect(sk, &err))
317                 return err;
318         if (err)
319                 goto failure;
320
321         err = tcp_connect(sk);
322
323         if (err)
324                 goto failure;
325
326         return 0;
327
328 failure:
329         /*
330          * This unhashes the socket and releases the local port,
331          * if necessary.
332          */
333         tcp_set_state(sk, TCP_CLOSE);
334         inet_bhash2_reset_saddr(sk);
335         ip_rt_put(rt);
336         sk->sk_route_caps = 0;
337         inet->inet_dport = 0;
338         return err;
339 }
340 EXPORT_SYMBOL(tcp_v4_connect);
341
342 /*
343  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
344  * It can be called through tcp_release_cb() if socket was owned by user
345  * at the time tcp_v4_err() was called to handle ICMP message.
346  */
347 void tcp_v4_mtu_reduced(struct sock *sk)
348 {
349         struct inet_sock *inet = inet_sk(sk);
350         struct dst_entry *dst;
351         u32 mtu;
352
353         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
354                 return;
355         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
356         dst = inet_csk_update_pmtu(sk, mtu);
357         if (!dst)
358                 return;
359
360         /* Something is about to go wrong... Remember the soft error
361          * in case this connection is not able to recover.
362          */
363         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
364                 sk->sk_err_soft = EMSGSIZE;
365
366         mtu = dst_mtu(dst);
367
368         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
369             ip_sk_accept_pmtu(sk) &&
370             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
371                 tcp_sync_mss(sk, mtu);
372
373                 /* Resend the TCP packet because it's
374                  * clear that the old packet has been
375                  * dropped. This is the new "fast" path mtu
376                  * discovery.
377                  */
378                 tcp_simple_retransmit(sk);
379         } /* else let the usual retransmit timer handle it */
380 }
381 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
382
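/* Let the routing layer update the cached route in response to an ICMP
 * redirect, provided the socket still has a valid dst.
 */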
383 static void do_redirect(struct sk_buff *skb, struct sock *sk)
384 {
385         struct dst_entry *dst = __sk_dst_check(sk, 0);
386
387         if (dst)
388                 dst->ops->redirect(dst, sk, skb);
389 }
390
391
392 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
393 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
394 {
395         struct request_sock *req = inet_reqsk(sk);
396         struct net *net = sock_net(sk);
397
398         /* ICMPs are not backlogged, hence we cannot get
399          * an established socket here.
400          */
401         if (seq != tcp_rsk(req)->snt_isn) {
402                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
403         } else if (abort) {
404                 /*
405                  * Still in SYN_RECV, just remove it silently.
406                  * There is no good way to pass the error to the newly
407                  * created socket, and POSIX does not want network
408                  * errors returned from accept().
409                  */
410                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
411                 tcp_listendrop(req->rsk_listener);
412         }
413         reqsk_put(req);
414 }
415 EXPORT_SYMBOL(tcp_req_err);
416
417 /* TCP-LD (RFC 6069) logic */
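/* Undo one step of exponential RTO backoff when an ICMP error indicates that
 * earlier retransmissions were lost to an unreachable route rather than to
 * congestion, then re-arm the retransmit timer (or retransmit immediately if
 * the reverted RTO has already expired).
 */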
418 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
419 {
420         struct inet_connection_sock *icsk = inet_csk(sk);
421         struct tcp_sock *tp = tcp_sk(sk);
422         struct sk_buff *skb;
423         s32 remaining;
424         u32 delta_us;
425
426         if (sock_owned_by_user(sk))
427                 return;
428
429         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
430             !icsk->icsk_backoff)
431                 return;
432
433         skb = tcp_rtx_queue_head(sk);
434         if (WARN_ON_ONCE(!skb))
435                 return;
436
437         icsk->icsk_backoff--;
438         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
439         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
440
441         tcp_mstamp_refresh(tp);
442         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
443         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
444
445         if (remaining > 0) {
446                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
447                                           remaining, TCP_RTO_MAX);
448         } else {
449                 /* RTO revert clocked out retransmission.
450                  * Will retransmit now.
451                  */
452                 tcp_retransmit_timer(sk);
453         }
454 }
455 EXPORT_SYMBOL(tcp_ld_RTO_revert);
456
457 /*
458  * This routine is called by the ICMP module when it gets some
459  * sort of error condition.  If err < 0 then the socket should
460  * be closed and the error returned to the user.  If err > 0
461  * it's just the icmp type << 8 | icmp code.  After adjustment
462  * header points to the first 8 bytes of the tcp header.  We need
463  * to find the appropriate port.
464  *
465  * The locking strategy used here is very "optimistic". When
466  * someone else accesses the socket the ICMP is just dropped
467  * and for some paths there is no check at all.
468  * A more general error queue to queue errors for later handling
469  * is probably better.
470  *
471  */
472
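/* For instance, under the "type << 8 | code" convention described above,
 * ICMP_DEST_UNREACH (type 3) with code ICMP_PORT_UNREACH (3) would be encoded
 * as (3 << 8) | 3 = 0x0303; in this function, ICMP codes are instead mapped to
 * errno values via icmp_err_convert[].
 */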
473 int tcp_v4_err(struct sk_buff *skb, u32 info)
474 {
475         const struct iphdr *iph = (const struct iphdr *)skb->data;
476         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
477         struct tcp_sock *tp;
478         struct inet_sock *inet;
479         const int type = icmp_hdr(skb)->type;
480         const int code = icmp_hdr(skb)->code;
481         struct sock *sk;
482         struct request_sock *fastopen;
483         u32 seq, snd_una;
484         int err;
485         struct net *net = dev_net(skb->dev);
486
487         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
488                                        iph->daddr, th->dest, iph->saddr,
489                                        ntohs(th->source), inet_iif(skb), 0);
490         if (!sk) {
491                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
492                 return -ENOENT;
493         }
494         if (sk->sk_state == TCP_TIME_WAIT) {
495                 inet_twsk_put(inet_twsk(sk));
496                 return 0;
497         }
498         seq = ntohl(th->seq);
499         if (sk->sk_state == TCP_NEW_SYN_RECV) {
500                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
501                                      type == ICMP_TIME_EXCEEDED ||
502                                      (type == ICMP_DEST_UNREACH &&
503                                       (code == ICMP_NET_UNREACH ||
504                                        code == ICMP_HOST_UNREACH)));
505                 return 0;
506         }
507
508         bh_lock_sock(sk);
509         /* If too many ICMPs get dropped on busy
510          * servers this needs to be solved differently.
511          * We do take care of the PMTU discovery (RFC1191) special case:
512          * we can receive locally generated ICMP messages while the socket is held.
513          */
514         if (sock_owned_by_user(sk)) {
515                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
516                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
517         }
518         if (sk->sk_state == TCP_CLOSE)
519                 goto out;
520
521         if (static_branch_unlikely(&ip4_min_ttl)) {
522                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
523                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
524                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
525                         goto out;
526                 }
527         }
528
529         tp = tcp_sk(sk);
530         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
531         fastopen = rcu_dereference(tp->fastopen_rsk);
532         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
533         if (sk->sk_state != TCP_LISTEN &&
534             !between(seq, snd_una, tp->snd_nxt)) {
535                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
536                 goto out;
537         }
538
539         switch (type) {
540         case ICMP_REDIRECT:
541                 if (!sock_owned_by_user(sk))
542                         do_redirect(skb, sk);
543                 goto out;
544         case ICMP_SOURCE_QUENCH:
545                 /* Just silently ignore these. */
546                 goto out;
547         case ICMP_PARAMETERPROB:
548                 err = EPROTO;
549                 break;
550         case ICMP_DEST_UNREACH:
551                 if (code > NR_ICMP_UNREACH)
552                         goto out;
553
554                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
555                         /* We are not interested in TCP_LISTEN and open_requests
556                          * (SYN-ACKs sent out by Linux are always < 576 bytes, so
557                          * they should go through unfragmented).
558                          */
559                         if (sk->sk_state == TCP_LISTEN)
560                                 goto out;
561
562                         WRITE_ONCE(tp->mtu_info, info);
563                         if (!sock_owned_by_user(sk)) {
564                                 tcp_v4_mtu_reduced(sk);
565                         } else {
566                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
567                                         sock_hold(sk);
568                         }
569                         goto out;
570                 }
571
572                 err = icmp_err_convert[code].errno;
573                 /* check if this ICMP message allows revert of backoff.
574                  * (see RFC 6069)
575                  */
576                 if (!fastopen &&
577                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
578                         tcp_ld_RTO_revert(sk, seq);
579                 break;
580         case ICMP_TIME_EXCEEDED:
581                 err = EHOSTUNREACH;
582                 break;
583         default:
584                 goto out;
585         }
586
587         switch (sk->sk_state) {
588         case TCP_SYN_SENT:
589         case TCP_SYN_RECV:
590                 /* Only in fast or simultaneous open. If a fast open socket is
591                  * already accepted it is treated as a connected one below.
592                  */
593                 if (fastopen && !fastopen->sk)
594                         break;
595
596                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
597
598                 if (!sock_owned_by_user(sk)) {
599                         sk->sk_err = err;
600
601                         sk_error_report(sk);
602
603                         tcp_done(sk);
604                 } else {
605                         sk->sk_err_soft = err;
606                 }
607                 goto out;
608         }
609
610         /* If we've already connected we will keep trying
611          * until we time out, or the user gives up.
612          *
613          * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
614          * considered hard errors (well, FRAG_FAILED too,
615          * but it is obsoleted by pmtu discovery).
616          *
617          * Note that in the modern internet, where routing is unreliable
618          * and broken firewalls sit in every dark corner sending random
619          * errors as ordered by their masters, even these two messages have
620          * lost their original sense (even Linux sends invalid PORT_UNREACHs).
621          *
622          * Now we are in compliance with RFCs.
623          *                                                      --ANK (980905)
624          */
625
626         inet = inet_sk(sk);
627         if (!sock_owned_by_user(sk) && inet->recverr) {
628                 sk->sk_err = err;
629                 sk_error_report(sk);
630         } else  { /* Only an error on timeout */
631                 sk->sk_err_soft = err;
632         }
633
634 out:
635         bh_unlock_sock(sk);
636         sock_put(sk);
637         return 0;
638 }
639
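/* Set up a partial checksum for transmit: store the pseudo-header sum in
 * th->check and record where the device (or software fallback) must write
 * the final TCP checksum.
 */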
640 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
641 {
642         struct tcphdr *th = tcp_hdr(skb);
643
644         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
645         skb->csum_start = skb_transport_header(skb) - skb->head;
646         skb->csum_offset = offsetof(struct tcphdr, check);
647 }
648
649 /* This routine computes an IPv4 TCP checksum. */
650 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
651 {
652         const struct inet_sock *inet = inet_sk(sk);
653
654         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
655 }
656 EXPORT_SYMBOL(tcp_v4_send_check);
657
658 /*
659  *      This routine will send an RST to the other tcp.
660  *
661  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
662  *                    for the reset?
663  *      Answer: if a packet caused an RST, it is not for a socket
664  *              existing in our system; if it did match a socket,
665  *              it is just a duplicate segment or a bug in the other side's TCP.
666  *              So we build the reply based only on the parameters
667  *              that arrived with the segment.
668  *      Exception: precedence violation. We do not implement it in any case.
669  */
670
671 #ifdef CONFIG_TCP_MD5SIG
672 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
673 #else
674 #define OPTION_BYTES sizeof(__be32)
675 #endif
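/* Room reserved for options in the on-stack RST reply: enough for an MD5
 * signature option when CONFIG_TCP_MD5SIG is enabled, otherwise one 32-bit
 * word used for the MPTCP reset option.
 */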
676
677 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
678 {
679         const struct tcphdr *th = tcp_hdr(skb);
680         struct {
681                 struct tcphdr th;
682                 __be32 opt[OPTION_BYTES / sizeof(__be32)];
683         } rep;
684         struct ip_reply_arg arg;
685 #ifdef CONFIG_TCP_MD5SIG
686         struct tcp_md5sig_key *key = NULL;
687         const __u8 *hash_location = NULL;
688         unsigned char newhash[16];
689         int genhash;
690         struct sock *sk1 = NULL;
691 #endif
692         u64 transmit_time = 0;
693         struct sock *ctl_sk;
694         struct net *net;
695
696         /* Never send a reset in response to a reset. */
697         if (th->rst)
698                 return;
699
700         /* If sk is not NULL, it means we did a successful lookup and the
701          * incoming route had to be correct. prequeue might have dropped our dst.
702          */
703         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
704                 return;
705
706         /* Swap the send and the receive. */
707         memset(&rep, 0, sizeof(rep));
708         rep.th.dest   = th->source;
709         rep.th.source = th->dest;
710         rep.th.doff   = sizeof(struct tcphdr) / 4;
711         rep.th.rst    = 1;
712
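        /* RFC 793 reset generation: if the offending segment carried an ACK,
         * the RST uses that ACK number as its sequence number; otherwise send
         * SEQ=0 and acknowledge everything the segment occupied.
         */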
713         if (th->ack) {
714                 rep.th.seq = th->ack_seq;
715         } else {
716                 rep.th.ack = 1;
717                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
718                                        skb->len - (th->doff << 2));
719         }
720
721         memset(&arg, 0, sizeof(arg));
722         arg.iov[0].iov_base = (unsigned char *)&rep;
723         arg.iov[0].iov_len  = sizeof(rep.th);
724
725         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
726 #ifdef CONFIG_TCP_MD5SIG
727         rcu_read_lock();
728         hash_location = tcp_parse_md5sig_option(th);
729         if (sk && sk_fullsock(sk)) {
730                 const union tcp_md5_addr *addr;
731                 int l3index;
732
733                 /* If sdif is set, the packet ingressed via a device
734                  * in an L3 domain and inet_iif is set to it.
735                  */
736                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
737                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
738                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
739         } else if (hash_location) {
740                 const union tcp_md5_addr *addr;
741                 int sdif = tcp_v4_sdif(skb);
742                 int dif = inet_iif(skb);
743                 int l3index;
744
745                 /*
746                  * The active side is lost. Try to find the listening socket
747                  * through the source port, and then find the md5 key through
748                  * the listening socket. We are not loosening security here:
749                  * the incoming packet is checked against the md5 hash of the
750                  * found key, and no RST is generated if the hash doesn't match.
751                  */
752                 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
753                                              NULL, 0, ip_hdr(skb)->saddr,
754                                              th->source, ip_hdr(skb)->daddr,
755                                              ntohs(th->source), dif, sdif);
756                 /* don't send an rst if we can't find a key */
757                 if (!sk1)
758                         goto out;
759
760                 /* If sdif is set, the packet ingressed via a device
761                  * in an L3 domain and dif is set to it.
762                  */
763                 l3index = sdif ? dif : 0;
764                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
765                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
766                 if (!key)
767                         goto out;
768
769
770                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
771                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
772                         goto out;
773
774         }
775
776         if (key) {
777                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
778                                    (TCPOPT_NOP << 16) |
779                                    (TCPOPT_MD5SIG << 8) |
780                                    TCPOLEN_MD5SIG);
781                 /* Update length and the length the header thinks exists */
782                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
783                 rep.th.doff = arg.iov[0].iov_len / 4;
784
785                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
786                                      key, ip_hdr(skb)->saddr,
787                                      ip_hdr(skb)->daddr, &rep.th);
788         }
789 #endif
790         /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
791         if (rep.opt[0] == 0) {
792                 __be32 mrst = mptcp_reset_option(skb);
793
794                 if (mrst) {
795                         rep.opt[0] = mrst;
796                         arg.iov[0].iov_len += sizeof(mrst);
797                         rep.th.doff = arg.iov[0].iov_len / 4;
798                 }
799         }
800
801         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
802                                       ip_hdr(skb)->saddr, /* XXX */
803                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
804         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
805         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
806
807         /* When the socket is gone, all binding information is lost and
808          * routing might fail. No choice here: if we chose to force the
809          * input interface, we would misroute in the case of an asymmetric route.
810          */
811         if (sk) {
812                 arg.bound_dev_if = sk->sk_bound_dev_if;
813                 if (sk_fullsock(sk))
814                         trace_tcp_send_reset(sk, skb);
815         }
816
817         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
818                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
819
820         arg.tos = ip_hdr(skb)->tos;
821         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
822         local_bh_disable();
823         ctl_sk = this_cpu_read(ipv4_tcp_sk);
824         sock_net_set(ctl_sk, net);
825         if (sk) {
826                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
827                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
828                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
829                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
830                 transmit_time = tcp_transmit_time(sk);
831                 xfrm_sk_clone_policy(ctl_sk, sk);
832         }
833         ip_send_unicast_reply(ctl_sk,
834                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
835                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
836                               &arg, arg.iov[0].iov_len,
837                               transmit_time);
838
839         ctl_sk->sk_mark = 0;
840         xfrm_sk_free_policy(ctl_sk);
841         sock_net_set(ctl_sk, &init_net);
842         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
843         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
844         local_bh_enable();
845
846 #ifdef CONFIG_TCP_MD5SIG
847 out:
848         rcu_read_unlock();
849 #endif
850 }
851
852 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
853    outside socket context, is certainly ugly. What can I do?
854  */
855
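/* Build a bare ACK (optionally carrying timestamp and MD5 options) on the
 * stack and send it through the per-CPU control socket; used for TIME-WAIT
 * and SYN-RECV ACKs below.
 */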
856 static void tcp_v4_send_ack(const struct sock *sk,
857                             struct sk_buff *skb, u32 seq, u32 ack,
858                             u32 win, u32 tsval, u32 tsecr, int oif,
859                             struct tcp_md5sig_key *key,
860                             int reply_flags, u8 tos)
861 {
862         const struct tcphdr *th = tcp_hdr(skb);
863         struct {
864                 struct tcphdr th;
865                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
866 #ifdef CONFIG_TCP_MD5SIG
867                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
868 #endif
869                         ];
870         } rep;
871         struct net *net = sock_net(sk);
872         struct ip_reply_arg arg;
873         struct sock *ctl_sk;
874         u64 transmit_time;
875
876         memset(&rep.th, 0, sizeof(struct tcphdr));
877         memset(&arg, 0, sizeof(arg));
878
879         arg.iov[0].iov_base = (unsigned char *)&rep;
880         arg.iov[0].iov_len  = sizeof(rep.th);
881         if (tsecr) {
882                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
883                                    (TCPOPT_TIMESTAMP << 8) |
884                                    TCPOLEN_TIMESTAMP);
885                 rep.opt[1] = htonl(tsval);
886                 rep.opt[2] = htonl(tsecr);
887                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
888         }
889
890         /* Swap the send and the receive. */
891         rep.th.dest    = th->source;
892         rep.th.source  = th->dest;
893         rep.th.doff    = arg.iov[0].iov_len / 4;
894         rep.th.seq     = htonl(seq);
895         rep.th.ack_seq = htonl(ack);
896         rep.th.ack     = 1;
897         rep.th.window  = htons(win);
898
899 #ifdef CONFIG_TCP_MD5SIG
900         if (key) {
901                 int offset = (tsecr) ? 3 : 0;
902
903                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
904                                           (TCPOPT_NOP << 16) |
905                                           (TCPOPT_MD5SIG << 8) |
906                                           TCPOLEN_MD5SIG);
907                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
908                 rep.th.doff = arg.iov[0].iov_len/4;
909
910                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
911                                     key, ip_hdr(skb)->saddr,
912                                     ip_hdr(skb)->daddr, &rep.th);
913         }
914 #endif
915         arg.flags = reply_flags;
916         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
917                                       ip_hdr(skb)->saddr, /* XXX */
918                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
919         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
920         if (oif)
921                 arg.bound_dev_if = oif;
922         arg.tos = tos;
923         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
924         local_bh_disable();
925         ctl_sk = this_cpu_read(ipv4_tcp_sk);
926         sock_net_set(ctl_sk, net);
927         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
928                            inet_twsk(sk)->tw_mark : sk->sk_mark;
929         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
930                            inet_twsk(sk)->tw_priority : sk->sk_priority;
931         transmit_time = tcp_transmit_time(sk);
932         ip_send_unicast_reply(ctl_sk,
933                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
934                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
935                               &arg, arg.iov[0].iov_len,
936                               transmit_time);
937
938         ctl_sk->sk_mark = 0;
939         sock_net_set(ctl_sk, &init_net);
940         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
941         local_bh_enable();
942 }
943
944 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
945 {
946         struct inet_timewait_sock *tw = inet_twsk(sk);
947         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
948
949         tcp_v4_send_ack(sk, skb,
950                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
951                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
952                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
953                         tcptw->tw_ts_recent,
954                         tw->tw_bound_dev_if,
955                         tcp_twsk_md5_key(tcptw),
956                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
957                         tw->tw_tos
958                         );
959
960         inet_twsk_put(tw);
961 }
962
963 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
964                                   struct request_sock *req)
965 {
966         const union tcp_md5_addr *addr;
967         int l3index;
968
969         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
970          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
971          */
972         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
973                                              tcp_sk(sk)->snd_nxt;
974
975         /* RFC 7323 2.3
976          * The window field (SEG.WND) of every outgoing segment, with the
977          * exception of <SYN> segments, MUST be right-shifted by
978          * Rcv.Wind.Shift bits:
979          */
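        /* e.g. with rcv_wscale == 7, an advertised window of 65536 bytes is
         * sent on the wire as 65536 >> 7 == 512.
         */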
980         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
981         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
982         tcp_v4_send_ack(sk, skb, seq,
983                         tcp_rsk(req)->rcv_nxt,
984                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
985                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
986                         req->ts_recent,
987                         0,
988                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
989                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
990                         ip_hdr(skb)->tos);
991 }
992
993 /*
994  *      Send a SYN-ACK after having received a SYN.
995  *      This still operates on a request_sock only, not on a big
996  *      socket.
997  */
998 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
999                               struct flowi *fl,
1000                               struct request_sock *req,
1001                               struct tcp_fastopen_cookie *foc,
1002                               enum tcp_synack_type synack_type,
1003                               struct sk_buff *syn_skb)
1004 {
1005         const struct inet_request_sock *ireq = inet_rsk(req);
1006         struct flowi4 fl4;
1007         int err = -1;
1008         struct sk_buff *skb;
1009         u8 tos;
1010
1011         /* First, grab a route. */
1012         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1013                 return -1;
1014
1015         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1016
1017         if (skb) {
1018                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1019
1020                 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1021                                 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1022                                 (inet_sk(sk)->tos & INET_ECN_MASK) :
1023                                 inet_sk(sk)->tos;
1024
1025                 if (!INET_ECN_is_capable(tos) &&
1026                     tcp_bpf_ca_needs_ecn((struct sock *)req))
1027                         tos |= INET_ECN_ECT_0;
1028
1029                 rcu_read_lock();
1030                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1031                                             ireq->ir_rmt_addr,
1032                                             rcu_dereference(ireq->ireq_opt),
1033                                             tos);
1034                 rcu_read_unlock();
1035                 err = net_xmit_eval(err);
1036         }
1037
1038         return err;
1039 }
1040
1041 /*
1042  *      IPv4 request_sock destructor.
1043  */
1044 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1045 {
1046         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1047 }
1048
1049 #ifdef CONFIG_TCP_MD5SIG
1050 /*
1051  * RFC2385 MD5 checksumming requires a mapping of
1052  * IP address->MD5 Key.
1053  * We need to maintain these in the sk structure.
1054  */
1055
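/* Static branch that lets tcp_md5_do_lookup() bail out early, keeping the MD5
 * lookup cost off the fast path when no MD5 keys are in use.
 */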
1056 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1057 EXPORT_SYMBOL(tcp_md5_needed);
1058
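/* Order two candidate keys that both match an address: a key scoped to an L3
 * domain wins over an unscoped one, otherwise the longer (more specific)
 * prefix wins.
 */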
1059 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1060 {
1061         if (!old)
1062                 return true;
1063
1064         /* l3index always overrides non-l3index */
1065         if (old->l3index && new->l3index == 0)
1066                 return false;
1067         if (old->l3index == 0 && new->l3index)
1068                 return true;
1069
1070         return old->prefixlen < new->prefixlen;
1071 }
1072
1073 /* Find the Key structure for an address.  */
1074 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1075                                            const union tcp_md5_addr *addr,
1076                                            int family)
1077 {
1078         const struct tcp_sock *tp = tcp_sk(sk);
1079         struct tcp_md5sig_key *key;
1080         const struct tcp_md5sig_info *md5sig;
1081         __be32 mask;
1082         struct tcp_md5sig_key *best_match = NULL;
1083         bool match;
1084
1085         /* caller either holds rcu_read_lock() or socket lock */
1086         md5sig = rcu_dereference_check(tp->md5sig_info,
1087                                        lockdep_sock_is_held(sk));
1088         if (!md5sig)
1089                 return NULL;
1090
1091         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1092                                  lockdep_sock_is_held(sk)) {
1093                 if (key->family != family)
1094                         continue;
1095                 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1096                         continue;
1097                 if (family == AF_INET) {
1098                         mask = inet_make_mask(key->prefixlen);
1099                         match = (key->addr.a4.s_addr & mask) ==
1100                                 (addr->a4.s_addr & mask);
1101 #if IS_ENABLED(CONFIG_IPV6)
1102                 } else if (family == AF_INET6) {
1103                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1104                                                   key->prefixlen);
1105 #endif
1106                 } else {
1107                         match = false;
1108                 }
1109
1110                 if (match && better_md5_match(best_match, key))
1111                         best_match = key;
1112         }
1113         return best_match;
1114 }
1115 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1116
1117 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1118                                                       const union tcp_md5_addr *addr,
1119                                                       int family, u8 prefixlen,
1120                                                       int l3index, u8 flags)
1121 {
1122         const struct tcp_sock *tp = tcp_sk(sk);
1123         struct tcp_md5sig_key *key;
1124         unsigned int size = sizeof(struct in_addr);
1125         const struct tcp_md5sig_info *md5sig;
1126
1127         /* caller either holds rcu_read_lock() or socket lock */
1128         md5sig = rcu_dereference_check(tp->md5sig_info,
1129                                        lockdep_sock_is_held(sk));
1130         if (!md5sig)
1131                 return NULL;
1132 #if IS_ENABLED(CONFIG_IPV6)
1133         if (family == AF_INET6)
1134                 size = sizeof(struct in6_addr);
1135 #endif
1136         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1137                                  lockdep_sock_is_held(sk)) {
1138                 if (key->family != family)
1139                         continue;
1140                 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1141                         continue;
1142                 if (key->l3index != l3index)
1143                         continue;
1144                 if (!memcmp(&key->addr, addr, size) &&
1145                     key->prefixlen == prefixlen)
1146                         return key;
1147         }
1148         return NULL;
1149 }
1150
1151 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1152                                          const struct sock *addr_sk)
1153 {
1154         const union tcp_md5_addr *addr;
1155         int l3index;
1156
1157         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1158                                                  addr_sk->sk_bound_dev_if);
1159         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1160         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1161 }
1162 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1163
1164 /* This can be called on a newly created socket, from other files */
1165 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1166                    int family, u8 prefixlen, int l3index, u8 flags,
1167                    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1168 {
1169         /* Add Key to the list */
1170         struct tcp_md5sig_key *key;
1171         struct tcp_sock *tp = tcp_sk(sk);
1172         struct tcp_md5sig_info *md5sig;
1173
1174         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1175         if (key) {
1176                 /* Pre-existing entry - just update that one.
1177                  * Note that the key might be used concurrently.
1178                  * data_race() tells KCSAN that we do not care about
1179                  * key mismatches, since changing the MD5 key on live flows
1180                  * can lead to packet drops.
1181                  */
1182                 data_race(memcpy(key->key, newkey, newkeylen));
1183
1184                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1185                  * Also note that a reader could catch the new key->keylen value
1186                  * but the old key->key[]; this is the reason we use __GFP_ZERO
1187                  * at sock_kmalloc() time below these lines.
1188                  */
1189                 WRITE_ONCE(key->keylen, newkeylen);
1190
1191                 return 0;
1192         }
1193
1194         md5sig = rcu_dereference_protected(tp->md5sig_info,
1195                                            lockdep_sock_is_held(sk));
1196         if (!md5sig) {
1197                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1198                 if (!md5sig)
1199                         return -ENOMEM;
1200
1201                 sk_gso_disable(sk);
1202                 INIT_HLIST_HEAD(&md5sig->head);
1203                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1204         }
1205
1206         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1207         if (!key)
1208                 return -ENOMEM;
1209         if (!tcp_alloc_md5sig_pool()) {
1210                 sock_kfree_s(sk, key, sizeof(*key));
1211                 return -ENOMEM;
1212         }
1213
1214         memcpy(key->key, newkey, newkeylen);
1215         key->keylen = newkeylen;
1216         key->family = family;
1217         key->prefixlen = prefixlen;
1218         key->l3index = l3index;
1219         key->flags = flags;
1220         memcpy(&key->addr, addr,
1221                (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1222                                                                  sizeof(struct in_addr));
1223         hlist_add_head_rcu(&key->node, &md5sig->head);
1224         return 0;
1225 }
1226 EXPORT_SYMBOL(tcp_md5_do_add);
1227
1228 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1229                    u8 prefixlen, int l3index, u8 flags)
1230 {
1231         struct tcp_md5sig_key *key;
1232
1233         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1234         if (!key)
1235                 return -ENOENT;
1236         hlist_del_rcu(&key->node);
1237         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1238         kfree_rcu(key, rcu);
1239         return 0;
1240 }
1241 EXPORT_SYMBOL(tcp_md5_do_del);
1242
1243 static void tcp_clear_md5_list(struct sock *sk)
1244 {
1245         struct tcp_sock *tp = tcp_sk(sk);
1246         struct tcp_md5sig_key *key;
1247         struct hlist_node *n;
1248         struct tcp_md5sig_info *md5sig;
1249
1250         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1251
1252         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1253                 hlist_del_rcu(&key->node);
1254                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1255                 kfree_rcu(key, rcu);
1256         }
1257 }
1258
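/* setsockopt(TCP_MD5SIG / TCP_MD5SIG_EXT) handler: validate the user request
 * and add, replace or delete the corresponding MD5 key.
 */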
1259 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1260                                  sockptr_t optval, int optlen)
1261 {
1262         struct tcp_md5sig cmd;
1263         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1264         const union tcp_md5_addr *addr;
1265         u8 prefixlen = 32;
1266         int l3index = 0;
1267         u8 flags;
1268
1269         if (optlen < sizeof(cmd))
1270                 return -EINVAL;
1271
1272         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1273                 return -EFAULT;
1274
1275         if (sin->sin_family != AF_INET)
1276                 return -EINVAL;
1277
1278         flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1279
1280         if (optname == TCP_MD5SIG_EXT &&
1281             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1282                 prefixlen = cmd.tcpm_prefixlen;
1283                 if (prefixlen > 32)
1284                         return -EINVAL;
1285         }
1286
1287         if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1288             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1289                 struct net_device *dev;
1290
1291                 rcu_read_lock();
1292                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1293                 if (dev && netif_is_l3_master(dev))
1294                         l3index = dev->ifindex;
1295
1296                 rcu_read_unlock();
1297
1298                 /* ok to reference set/not set outside of rcu;
1299                  * right now device MUST be an L3 master
1300                  */
1301                 if (!dev || !l3index)
1302                         return -EINVAL;
1303         }
1304
1305         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1306
1307         if (!cmd.tcpm_keylen)
1308                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1309
1310         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1311                 return -EINVAL;
1312
1313         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1314                               cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1315 }
1316
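/* Feed the RFC 2385 pseudo-header block (addresses, protocol, length) and a
 * copy of the TCP header with its checksum field zeroed into the MD5 request.
 */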
1317 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1318                                    __be32 daddr, __be32 saddr,
1319                                    const struct tcphdr *th, int nbytes)
1320 {
1321         struct tcp4_pseudohdr *bp;
1322         struct scatterlist sg;
1323         struct tcphdr *_th;
1324
1325         bp = hp->scratch;
1326         bp->saddr = saddr;
1327         bp->daddr = daddr;
1328         bp->pad = 0;
1329         bp->protocol = IPPROTO_TCP;
1330         bp->len = cpu_to_be16(nbytes);
1331
1332         _th = (struct tcphdr *)(bp + 1);
1333         memcpy(_th, th, sizeof(*th));
1334         _th->check = 0;
1335
1336         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1337         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1338                                 sizeof(*bp) + sizeof(*th));
1339         return crypto_ahash_update(hp->md5_req);
1340 }
1341
1342 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1343                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1344 {
1345         struct tcp_md5sig_pool *hp;
1346         struct ahash_request *req;
1347
1348         hp = tcp_get_md5sig_pool();
1349         if (!hp)
1350                 goto clear_hash_noput;
1351         req = hp->md5_req;
1352
1353         if (crypto_ahash_init(req))
1354                 goto clear_hash;
1355         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1356                 goto clear_hash;
1357         if (tcp_md5_hash_key(hp, key))
1358                 goto clear_hash;
1359         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1360         if (crypto_ahash_final(req))
1361                 goto clear_hash;
1362
1363         tcp_put_md5sig_pool();
1364         return 0;
1365
1366 clear_hash:
1367         tcp_put_md5sig_pool();
1368 clear_hash_noput:
1369         memset(md5_hash, 0, 16);
1370         return 1;
1371 }
1372
1373 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1374                         const struct sock *sk,
1375                         const struct sk_buff *skb)
1376 {
1377         struct tcp_md5sig_pool *hp;
1378         struct ahash_request *req;
1379         const struct tcphdr *th = tcp_hdr(skb);
1380         __be32 saddr, daddr;
1381
1382         if (sk) { /* valid for establish/request sockets */
1383                 saddr = sk->sk_rcv_saddr;
1384                 daddr = sk->sk_daddr;
1385         } else {
1386                 const struct iphdr *iph = ip_hdr(skb);
1387                 saddr = iph->saddr;
1388                 daddr = iph->daddr;
1389         }
1390
1391         hp = tcp_get_md5sig_pool();
1392         if (!hp)
1393                 goto clear_hash_noput;
1394         req = hp->md5_req;
1395
1396         if (crypto_ahash_init(req))
1397                 goto clear_hash;
1398
1399         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1400                 goto clear_hash;
1401         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1402                 goto clear_hash;
1403         if (tcp_md5_hash_key(hp, key))
1404                 goto clear_hash;
1405         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1406         if (crypto_ahash_final(req))
1407                 goto clear_hash;
1408
1409         tcp_put_md5sig_pool();
1410         return 0;
1411
1412 clear_hash:
1413         tcp_put_md5sig_pool();
1414 clear_hash_noput:
1415         memset(md5_hash, 0, 16);
1416         return 1;
1417 }
1418 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1419
1420 #endif
1421
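/* Fill the request sock from the incoming SYN: local/remote addresses are
 * taken from the IP header and any IP options carried by the SYN are saved
 * in ireq->ireq_opt.
 */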
1422 static void tcp_v4_init_req(struct request_sock *req,
1423                             const struct sock *sk_listener,
1424                             struct sk_buff *skb)
1425 {
1426         struct inet_request_sock *ireq = inet_rsk(req);
1427         struct net *net = sock_net(sk_listener);
1428
1429         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1430         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1431         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1432 }
1433
1434 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1435                                           struct sk_buff *skb,
1436                                           struct flowi *fl,
1437                                           struct request_sock *req)
1438 {
1439         tcp_v4_init_req(req, sk, skb);
1440
1441         if (security_inet_conn_request(sk, skb, req))
1442                 return NULL;
1443
1444         return inet_csk_route_req(sk, &fl->u.ip4, req);
1445 }
1446
1447 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1448         .family         =       PF_INET,
1449         .obj_size       =       sizeof(struct tcp_request_sock),
1450         .rtx_syn_ack    =       tcp_rtx_synack,
1451         .send_ack       =       tcp_v4_reqsk_send_ack,
1452         .destructor     =       tcp_v4_reqsk_destructor,
1453         .send_reset     =       tcp_v4_send_reset,
1454         .syn_ack_timeout =      tcp_syn_ack_timeout,
1455 };
1456
1457 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1458         .mss_clamp      =       TCP_MSS_DEFAULT,
1459 #ifdef CONFIG_TCP_MD5SIG
1460         .req_md5_lookup =       tcp_v4_md5_lookup,
1461         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1462 #endif
1463 #ifdef CONFIG_SYN_COOKIES
1464         .cookie_init_seq =      cookie_v4_init_sequence,
1465 #endif
1466         .route_req      =       tcp_v4_route_req,
1467         .init_seq       =       tcp_v4_init_seq,
1468         .init_ts_off    =       tcp_v4_init_ts_off,
1469         .send_synack    =       tcp_v4_send_synack,
1470 };
1471
1472 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1473 {
1474         /* Never answer SYNs sent to broadcast or multicast */
1475         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1476                 goto drop;
1477
1478         return tcp_conn_request(&tcp_request_sock_ops,
1479                                 &tcp_request_sock_ipv4_ops, sk, skb);
1480
1481 drop:
1482         tcp_listendrop(sk);
1483         return 0;
1484 }
1485 EXPORT_SYMBOL(tcp_v4_conn_request);
1486
1487
1488 /*
1489  * The three-way handshake has completed - we got a valid ACK from the
1490  * peer - now create the new socket.
1491  */
1492 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1493                                   struct request_sock *req,
1494                                   struct dst_entry *dst,
1495                                   struct request_sock *req_unhash,
1496                                   bool *own_req)
1497 {
1498         struct inet_request_sock *ireq;
1499         bool found_dup_sk = false;
1500         struct inet_sock *newinet;
1501         struct tcp_sock *newtp;
1502         struct sock *newsk;
1503 #ifdef CONFIG_TCP_MD5SIG
1504         const union tcp_md5_addr *addr;
1505         struct tcp_md5sig_key *key;
1506         int l3index;
1507 #endif
1508         struct ip_options_rcu *inet_opt;
1509
1510         if (sk_acceptq_is_full(sk))
1511                 goto exit_overflow;
1512
1513         newsk = tcp_create_openreq_child(sk, req, skb);
1514         if (!newsk)
1515                 goto exit_nonewsk;
1516
1517         newsk->sk_gso_type = SKB_GSO_TCPV4;
1518         inet_sk_rx_dst_set(newsk, skb);
1519
1520         newtp                 = tcp_sk(newsk);
1521         newinet               = inet_sk(newsk);
1522         ireq                  = inet_rsk(req);
1523         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1524         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1525         newsk->sk_bound_dev_if = ireq->ir_iif;
1526         newinet->inet_saddr   = ireq->ir_loc_addr;
1527         inet_opt              = rcu_dereference(ireq->ireq_opt);
1528         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1529         newinet->mc_index     = inet_iif(skb);
1530         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1531         newinet->rcv_tos      = ip_hdr(skb)->tos;
1532         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1533         if (inet_opt)
1534                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1535         newinet->inet_id = get_random_u16();
1536
1537         /* Set ToS of the new socket based upon the value of incoming SYN.
1538          * ECT bits are set later in tcp_init_transfer().
1539          */
1540         if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1541                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1542
1543         if (!dst) {
1544                 dst = inet_csk_route_child_sock(sk, newsk, req);
1545                 if (!dst)
1546                         goto put_and_exit;
1547         } else {
1548                 /* syncookie case : see end of cookie_v4_check() */
1549         }
1550         sk_setup_caps(newsk, dst);
1551
1552         tcp_ca_openreq_child(newsk, dst);
1553
1554         tcp_sync_mss(newsk, dst_mtu(dst));
1555         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1556
1557         tcp_initialize_rcv_mss(newsk);
1558
1559 #ifdef CONFIG_TCP_MD5SIG
1560         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1561         /* Copy over the MD5 key from the original socket */
1562         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1563         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1564         if (key) {
1565                 /*
1566                  * We're using one, so create a matching key
1567                  * on the newsk structure. If we fail to get
1568                  * memory, then we end up not copying the key
1569                  * across. Shucks.
1570                  */
1571                 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1572                                key->key, key->keylen, GFP_ATOMIC);
1573                 sk_gso_disable(newsk);
1574         }
1575 #endif
1576
1577         if (__inet_inherit_port(sk, newsk) < 0)
1578                 goto put_and_exit;
1579         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1580                                        &found_dup_sk);
1581         if (likely(*own_req)) {
1582                 tcp_move_syn(newtp, req);
1583                 ireq->ireq_opt = NULL;
1584         } else {
1585                 newinet->inet_opt = NULL;
1586
1587                 if (!req_unhash && found_dup_sk) {
1588                         /* This code path should be executed only in
1589                          * the syncookie case
1590                          */
1591                         bh_unlock_sock(newsk);
1592                         sock_put(newsk);
1593                         newsk = NULL;
1594                 }
1595         }
1596         return newsk;
1597
1598 exit_overflow:
1599         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1600 exit_nonewsk:
1601         dst_release(dst);
1602 exit:
1603         tcp_listendrop(sk);
1604         return NULL;
1605 put_and_exit:
1606         newinet->inet_opt = NULL;
1607         inet_csk_prepare_forced_close(newsk);
1608         tcp_done(newsk);
1609         goto exit;
1610 }
1611 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1612
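/* With CONFIG_SYN_COOKIES, a non-SYN segment reaching a listener may carry a
 * syncookie in its ACK field; cookie_v4_check() validates it and may return a
 * freshly created child socket.  Callers treat a NULL return as a drop.
 */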
1613 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1614 {
1615 #ifdef CONFIG_SYN_COOKIES
1616         const struct tcphdr *th = tcp_hdr(skb);
1617
1618         if (!th->syn)
1619                 sk = cookie_v4_check(sk, skb);
1620 #endif
1621         return sk;
1622 }
1623
1624 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1625                          struct tcphdr *th, u32 *cookie)
1626 {
1627         u16 mss = 0;
1628 #ifdef CONFIG_SYN_COOKIES
1629         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1630                                     &tcp_request_sock_ipv4_ops, sk, th);
1631         if (mss) {
1632                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1633                 tcp_synq_overflow(sk);
1634         }
1635 #endif
1636         return mss;
1637 }
1638
1639 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1640                                                            u32));
1641 /* The socket must have its spinlock held when we get
1642  * here, unless it is a TCP_LISTEN socket.
1643  *
1644  * We have a potential double-lock case here, so even when
1645  * doing backlog processing we use the BH locking scheme.
1646  * This is because we cannot sleep with the original spinlock
1647  * held.
1648  */
1649 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1650 {
1651         enum skb_drop_reason reason;
1652         struct sock *rsk;
1653
1654         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1655                 struct dst_entry *dst;
1656
1657                 dst = rcu_dereference_protected(sk->sk_rx_dst,
1658                                                 lockdep_sock_is_held(sk));
1659
1660                 sock_rps_save_rxhash(sk, skb);
1661                 sk_mark_napi_id(sk, skb);
1662                 if (dst) {
1663                         if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1664                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1665                                              dst, 0)) {
1666                                 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1667                                 dst_release(dst);
1668                         }
1669                 }
1670                 tcp_rcv_established(sk, skb);
1671                 return 0;
1672         }
1673
1674         reason = SKB_DROP_REASON_NOT_SPECIFIED;
1675         if (tcp_checksum_complete(skb))
1676                 goto csum_err;
1677
1678         if (sk->sk_state == TCP_LISTEN) {
1679                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1680
1681                 if (!nsk)
1682                         goto discard;
1683                 if (nsk != sk) {
1684                         if (tcp_child_process(sk, nsk, skb)) {
1685                                 rsk = nsk;
1686                                 goto reset;
1687                         }
1688                         return 0;
1689                 }
1690         } else
1691                 sock_rps_save_rxhash(sk, skb);
1692
1693         if (tcp_rcv_state_process(sk, skb)) {
1694                 rsk = sk;
1695                 goto reset;
1696         }
1697         return 0;
1698
1699 reset:
1700         tcp_v4_send_reset(rsk, skb);
1701 discard:
1702         kfree_skb_reason(skb, reason);
1703         /* Be careful here. If this function gets more complicated and
1704          * gcc suffers from register pressure on the x86, sk (in %ebx)
1705          * might be destroyed here. This current version compiles correctly,
1706          * but you have been warned.
1707          */
1708         return 0;
1709
1710 csum_err:
1711         reason = SKB_DROP_REASON_TCP_CSUM;
1712         trace_tcp_bad_csum(skb);
1713         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1714         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1715         goto discard;
1716 }
1717 EXPORT_SYMBOL(tcp_v4_do_rcv);
1718
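/* Early demux, run by the IP layer before the routing decision: look up an
 * established socket for this segment and, if found, attach it to the skb
 * together with its cached input route so the later receive path can skip
 * the socket and route lookups.
 */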
1719 int tcp_v4_early_demux(struct sk_buff *skb)
1720 {
1721         struct net *net = dev_net(skb->dev);
1722         const struct iphdr *iph;
1723         const struct tcphdr *th;
1724         struct sock *sk;
1725
1726         if (skb->pkt_type != PACKET_HOST)
1727                 return 0;
1728
1729         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1730                 return 0;
1731
1732         iph = ip_hdr(skb);
1733         th = tcp_hdr(skb);
1734
1735         if (th->doff < sizeof(struct tcphdr) / 4)
1736                 return 0;
1737
1738         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1739                                        iph->saddr, th->source,
1740                                        iph->daddr, ntohs(th->dest),
1741                                        skb->skb_iif, inet_sdif(skb));
1742         if (sk) {
1743                 skb->sk = sk;
1744                 skb->destructor = sock_edemux;
1745                 if (sk_fullsock(sk)) {
1746                         struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1747
1748                         if (dst)
1749                                 dst = dst_check(dst, 0);
1750                         if (dst &&
1751                             sk->sk_rx_dst_ifindex == skb->skb_iif)
1752                                 skb_dst_set_noref(skb, dst);
1753                 }
1754         }
1755         return 0;
1756 }
1757
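/* Queue an skb on the backlog of a socket that is currently owned by user
 * context.  Called with the socket bh-locked.  The segment is first coalesced
 * with the backlog tail when possible.  Returns false when the skb was
 * consumed (coalesced or queued), true when it was dropped; in the drop case
 * the socket has already been unlocked and *reason has been set.
 */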
1758 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1759                      enum skb_drop_reason *reason)
1760 {
1761         u32 limit, tail_gso_size, tail_gso_segs;
1762         struct skb_shared_info *shinfo;
1763         const struct tcphdr *th;
1764         struct tcphdr *thtail;
1765         struct sk_buff *tail;
1766         unsigned int hdrlen;
1767         bool fragstolen;
1768         u32 gso_segs;
1769         u32 gso_size;
1770         int delta;
1771
1772         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1773          * we can fix skb->truesize to its real value to avoid future drops.
1774          * This is valid because skb is not yet charged to the socket.
1775          * It has been noticed that pure SACK packets were sometimes dropped
1776          * (if cooked by drivers without the copybreak feature).
1777          */
1778         skb_condense(skb);
1779
1780         skb_dst_drop(skb);
1781
1782         if (unlikely(tcp_checksum_complete(skb))) {
1783                 bh_unlock_sock(sk);
1784                 trace_tcp_bad_csum(skb);
1785                 *reason = SKB_DROP_REASON_TCP_CSUM;
1786                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1787                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1788                 return true;
1789         }
1790
1791         /* Attempt coalescing to last skb in backlog, even if we are
1792          * above the limits.
1793          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1794          */
1795         th = (const struct tcphdr *)skb->data;
1796         hdrlen = th->doff * 4;
1797
1798         tail = sk->sk_backlog.tail;
1799         if (!tail)
1800                 goto no_coalesce;
1801         thtail = (struct tcphdr *)tail->data;
1802
1803         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1804             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1805             ((TCP_SKB_CB(tail)->tcp_flags |
1806               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1807             !((TCP_SKB_CB(tail)->tcp_flags &
1808               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1809             ((TCP_SKB_CB(tail)->tcp_flags ^
1810               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1811 #ifdef CONFIG_TLS_DEVICE
1812             tail->decrypted != skb->decrypted ||
1813 #endif
1814             thtail->doff != th->doff ||
1815             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1816                 goto no_coalesce;
1817
1818         __skb_pull(skb, hdrlen);
1819
1820         shinfo = skb_shinfo(skb);
1821         gso_size = shinfo->gso_size ?: skb->len;
1822         gso_segs = shinfo->gso_segs ?: 1;
1823
1824         shinfo = skb_shinfo(tail);
1825         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1826         tail_gso_segs = shinfo->gso_segs ?: 1;
1827
1828         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1829                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1830
1831                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1832                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1833                         thtail->window = th->window;
1834                 }
1835
1836                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1837                  * thtail->fin, so that the fast path in tcp_rcv_established()
1838                  * is not entered if we append a packet with a FIN.
1839                  * SYN, RST, URG are not present.
1840                  * ACK is set on both packets.
1841                  * PSH : we do not really care about it in the TCP stack,
1842                  *       at least for 'GRO' packets.
1843                  */
1844                 thtail->fin |= th->fin;
1845                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1846
1847                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1848                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1849                         tail->tstamp = skb->tstamp;
1850                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1851                 }
1852
1853                 /* Not as strict as GRO. We only need to carry mss max value */
1854                 shinfo->gso_size = max(gso_size, tail_gso_size);
1855                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1856
1857                 sk->sk_backlog.len += delta;
1858                 __NET_INC_STATS(sock_net(sk),
1859                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1860                 kfree_skb_partial(skb, fragstolen);
1861                 return false;
1862         }
1863         __skb_push(skb, hdrlen);
1864
1865 no_coalesce:
1866         limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1867
1868         /* Only the socket owner can try to collapse/prune rx queues
1869          * to reduce memory overhead, so add a little headroom here.
1870          * Only a few socket backlogs are likely to be non-empty at once.
1871          */
1872         limit += 64 * 1024;
1873
1874         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1875                 bh_unlock_sock(sk);
1876                 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1877                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1878                 return true;
1879         }
1880         return false;
1881 }
1882 EXPORT_SYMBOL(tcp_add_backlog);
1883
1884 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1885 {
1886         struct tcphdr *th = (struct tcphdr *)skb->data;
1887
1888         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1889 }
1890 EXPORT_SYMBOL(tcp_filter);
1891
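/* Undo tcp_v4_fill_cb(): move the saved inet_skb_parm back into IPCB(skb)
 * before the skb is handed over to another socket lookup.
 */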
1892 static void tcp_v4_restore_cb(struct sk_buff *skb)
1893 {
1894         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1895                 sizeof(struct inet_skb_parm));
1896 }
1897
1898 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1899                            const struct tcphdr *th)
1900 {
1901         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1902          * barrier() makes sure the compiler won't play fool^Waliasing games.
1903          */
1904         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1905                 sizeof(struct inet_skb_parm));
1906         barrier();
1907
1908         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1909         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1910                                     skb->len - th->doff * 4);
1911         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1912         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1913         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1914         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1915         TCP_SKB_CB(skb)->sacked  = 0;
1916         TCP_SKB_CB(skb)->has_rxtstamp =
1917                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1918 }
1919
1920 /*
1921  *      From tcp_input.c
1922  */
1923
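/* Main IPv4 TCP receive routine: validate the header and checksum, look up
 * the owning socket and dispatch to the TIME_WAIT, request sock or regular
 * receive paths, sending a RST when no matching socket exists.
 */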
1924 int tcp_v4_rcv(struct sk_buff *skb)
1925 {
1926         struct net *net = dev_net(skb->dev);
1927         enum skb_drop_reason drop_reason;
1928         int sdif = inet_sdif(skb);
1929         int dif = inet_iif(skb);
1930         const struct iphdr *iph;
1931         const struct tcphdr *th;
1932         bool refcounted;
1933         struct sock *sk;
1934         int ret;
1935
1936         drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1937         if (skb->pkt_type != PACKET_HOST)
1938                 goto discard_it;
1939
1940         /* Count it even if it's bad */
1941         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1942
1943         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1944                 goto discard_it;
1945
1946         th = (const struct tcphdr *)skb->data;
1947
1948         if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1949                 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1950                 goto bad_packet;
1951         }
1952         if (!pskb_may_pull(skb, th->doff * 4))
1953                 goto discard_it;
1954
1955         /* An explanation is required here, I think.
1956          * Packet length and doff are validated by header prediction,
1957          * provided the case of th->doff==0 is eliminated.
1958          * So, we defer the checks. */
1959
1960         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1961                 goto csum_error;
1962
1963         th = (const struct tcphdr *)skb->data;
1964         iph = ip_hdr(skb);
1965 lookup:
1966         sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
1967                                skb, __tcp_hdrlen(th), th->source,
1968                                th->dest, sdif, &refcounted);
1969         if (!sk)
1970                 goto no_tcp_socket;
1971
1972 process:
1973         if (sk->sk_state == TCP_TIME_WAIT)
1974                 goto do_time_wait;
1975
1976         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1977                 struct request_sock *req = inet_reqsk(sk);
1978                 bool req_stolen = false;
1979                 struct sock *nsk;
1980
1981                 sk = req->rsk_listener;
1982                 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1983                         drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1984                 else
1985                         drop_reason = tcp_inbound_md5_hash(sk, skb,
1986                                                    &iph->saddr, &iph->daddr,
1987                                                    AF_INET, dif, sdif);
1988                 if (unlikely(drop_reason)) {
1989                         sk_drops_add(sk, skb);
1990                         reqsk_put(req);
1991                         goto discard_it;
1992                 }
1993                 if (tcp_checksum_complete(skb)) {
1994                         reqsk_put(req);
1995                         goto csum_error;
1996                 }
1997                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1998                         nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
1999                         if (!nsk) {
2000                                 inet_csk_reqsk_queue_drop_and_put(sk, req);
2001                                 goto lookup;
2002                         }
2003                         sk = nsk;
2004                         /* reuseport_migrate_sock() has already taken one sk_refcnt
2005                          * before returning.
2006                          */
2007                 } else {
2008                         /* We own a reference on the listener, increase it again
2009                          * as we might lose it too soon.
2010                          */
2011                         sock_hold(sk);
2012                 }
2013                 refcounted = true;
2014                 nsk = NULL;
2015                 if (!tcp_filter(sk, skb)) {
2016                         th = (const struct tcphdr *)skb->data;
2017                         iph = ip_hdr(skb);
2018                         tcp_v4_fill_cb(skb, iph, th);
2019                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2020                 } else {
2021                         drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2022                 }
2023                 if (!nsk) {
2024                         reqsk_put(req);
2025                         if (req_stolen) {
2026                                 /* Another cpu got exclusive access to req
2027                                  * and created a full blown socket.
2028                                  * Try to feed this packet to this socket
2029                                  * instead of discarding it.
2030                                  */
2031                                 tcp_v4_restore_cb(skb);
2032                                 sock_put(sk);
2033                                 goto lookup;
2034                         }
2035                         goto discard_and_relse;
2036                 }
2037                 nf_reset_ct(skb);
2038                 if (nsk == sk) {
2039                         reqsk_put(req);
2040                         tcp_v4_restore_cb(skb);
2041                 } else if (tcp_child_process(sk, nsk, skb)) {
2042                         tcp_v4_send_reset(nsk, skb);
2043                         goto discard_and_relse;
2044                 } else {
2045                         sock_put(sk);
2046                         return 0;
2047                 }
2048         }
2049
2050         if (static_branch_unlikely(&ip4_min_ttl)) {
2051                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2052                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2053                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2054                         goto discard_and_relse;
2055                 }
2056         }
2057
2058         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2059                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2060                 goto discard_and_relse;
2061         }
2062
2063         drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2064                                            &iph->daddr, AF_INET, dif, sdif);
2065         if (drop_reason)
2066                 goto discard_and_relse;
2067
2068         nf_reset_ct(skb);
2069
2070         if (tcp_filter(sk, skb)) {
2071                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2072                 goto discard_and_relse;
2073         }
2074         th = (const struct tcphdr *)skb->data;
2075         iph = ip_hdr(skb);
2076         tcp_v4_fill_cb(skb, iph, th);
2077
2078         skb->dev = NULL;
2079
2080         if (sk->sk_state == TCP_LISTEN) {
2081                 ret = tcp_v4_do_rcv(sk, skb);
2082                 goto put_and_return;
2083         }
2084
2085         sk_incoming_cpu_update(sk);
2086
2087         bh_lock_sock_nested(sk);
2088         tcp_segs_in(tcp_sk(sk), skb);
2089         ret = 0;
2090         if (!sock_owned_by_user(sk)) {
2091                 ret = tcp_v4_do_rcv(sk, skb);
2092         } else {
2093                 if (tcp_add_backlog(sk, skb, &drop_reason))
2094                         goto discard_and_relse;
2095         }
2096         bh_unlock_sock(sk);
2097
2098 put_and_return:
2099         if (refcounted)
2100                 sock_put(sk);
2101
2102         return ret;
2103
2104 no_tcp_socket:
2105         drop_reason = SKB_DROP_REASON_NO_SOCKET;
2106         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2107                 goto discard_it;
2108
2109         tcp_v4_fill_cb(skb, iph, th);
2110
2111         if (tcp_checksum_complete(skb)) {
2112 csum_error:
2113                 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2114                 trace_tcp_bad_csum(skb);
2115                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2116 bad_packet:
2117                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2118         } else {
2119                 tcp_v4_send_reset(NULL, skb);
2120         }
2121
2122 discard_it:
2123         SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2124         /* Discard frame. */
2125         kfree_skb_reason(skb, drop_reason);
2126         return 0;
2127
2128 discard_and_relse:
2129         sk_drops_add(sk, skb);
2130         if (refcounted)
2131                 sock_put(sk);
2132         goto discard_it;
2133
2134 do_time_wait:
2135         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2136                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2137                 inet_twsk_put(inet_twsk(sk));
2138                 goto discard_it;
2139         }
2140
2141         tcp_v4_fill_cb(skb, iph, th);
2142
2143         if (tcp_checksum_complete(skb)) {
2144                 inet_twsk_put(inet_twsk(sk));
2145                 goto csum_error;
2146         }
2147         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2148         case TCP_TW_SYN: {
2149                 struct sock *sk2 = inet_lookup_listener(net,
2150                                                         net->ipv4.tcp_death_row.hashinfo,
2151                                                         skb, __tcp_hdrlen(th),
2152                                                         iph->saddr, th->source,
2153                                                         iph->daddr, th->dest,
2154                                                         inet_iif(skb),
2155                                                         sdif);
2156                 if (sk2) {
2157                         inet_twsk_deschedule_put(inet_twsk(sk));
2158                         sk = sk2;
2159                         tcp_v4_restore_cb(skb);
2160                         refcounted = false;
2161                         goto process;
2162                 }
2163         }
2164                 /* to ACK */
2165                 fallthrough;
2166         case TCP_TW_ACK:
2167                 tcp_v4_timewait_ack(sk, skb);
2168                 break;
2169         case TCP_TW_RST:
2170                 tcp_v4_send_reset(sk, skb);
2171                 inet_twsk_deschedule_put(inet_twsk(sk));
2172                 goto discard_it;
2173         case TCP_TW_SUCCESS:;
2174         }
2175         goto discard_it;
2176 }
2177
2178 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2179         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2180         .twsk_unique    = tcp_twsk_unique,
2181         .twsk_destructor= tcp_twsk_destructor,
2182 };
2183
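/* Cache the input route of a received skb on the socket so that subsequent
 * packets can reuse it through the sk_rx_dst fast path.
 */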
2184 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2185 {
2186         struct dst_entry *dst = skb_dst(skb);
2187
2188         if (dst && dst_hold_safe(dst)) {
2189                 rcu_assign_pointer(sk->sk_rx_dst, dst);
2190                 sk->sk_rx_dst_ifindex = skb->skb_iif;
2191         }
2192 }
2193 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2194
2195 const struct inet_connection_sock_af_ops ipv4_specific = {
2196         .queue_xmit        = ip_queue_xmit,
2197         .send_check        = tcp_v4_send_check,
2198         .rebuild_header    = inet_sk_rebuild_header,
2199         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2200         .conn_request      = tcp_v4_conn_request,
2201         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2202         .net_header_len    = sizeof(struct iphdr),
2203         .setsockopt        = ip_setsockopt,
2204         .getsockopt        = ip_getsockopt,
2205         .addr2sockaddr     = inet_csk_addr2sockaddr,
2206         .sockaddr_len      = sizeof(struct sockaddr_in),
2207         .mtu_reduced       = tcp_v4_mtu_reduced,
2208 };
2209 EXPORT_SYMBOL(ipv4_specific);
2210
2211 #ifdef CONFIG_TCP_MD5SIG
2212 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2213         .md5_lookup             = tcp_v4_md5_lookup,
2214         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2215         .md5_parse              = tcp_v4_parse_md5_keys,
2216 };
2217 #endif
2218
2219 /* NOTE: A lot of things are set to zero explicitly by the call to
2220  *       sk_alloc(), so they need not be done here.
2221  */
2222 static int tcp_v4_init_sock(struct sock *sk)
2223 {
2224         struct inet_connection_sock *icsk = inet_csk(sk);
2225
2226         tcp_init_sock(sk);
2227
2228         icsk->icsk_af_ops = &ipv4_specific;
2229
2230 #ifdef CONFIG_TCP_MD5SIG
2231         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2232 #endif
2233
2234         return 0;
2235 }
2236
2237 void tcp_v4_destroy_sock(struct sock *sk)
2238 {
2239         struct tcp_sock *tp = tcp_sk(sk);
2240
2241         trace_tcp_destroy_sock(sk);
2242
2243         tcp_clear_xmit_timers(sk);
2244
2245         tcp_cleanup_congestion_control(sk);
2246
2247         tcp_cleanup_ulp(sk);
2248
2249         /* Clean up the write buffer. */
2250         tcp_write_queue_purge(sk);
2251
2252         /* Check if we want to disable active TFO */
2253         tcp_fastopen_active_disable_ofo_check(sk);
2254
2255         /* Cleans up our, hopefully empty, out_of_order_queue. */
2256         skb_rbtree_purge(&tp->out_of_order_queue);
2257
2258 #ifdef CONFIG_TCP_MD5SIG
2259         /* Clean up the MD5 key list, if any */
2260         if (tp->md5sig_info) {
2261                 tcp_clear_md5_list(sk);
2262                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2263                 tp->md5sig_info = NULL;
2264         }
2265 #endif
2266
2267         /* Clean up a referenced TCP bind bucket. */
2268         if (inet_csk(sk)->icsk_bind_hash)
2269                 inet_put_port(sk);
2270
2271         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2272
2273         /* If socket is aborted during connect operation */
2274         tcp_free_fastopen_req(tp);
2275         tcp_fastopen_destroy_cipher(sk);
2276         tcp_saved_syn_free(tp);
2277
2278         sk_sockets_allocated_dec(sk);
2279 }
2280 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2281
2282 #ifdef CONFIG_PROC_FS
2283 /* Proc filesystem TCP sock list dumping. */
2284
2285 static unsigned short seq_file_family(const struct seq_file *seq);
2286
2287 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2288 {
2289         unsigned short family = seq_file_family(seq);
2290
2291         /* AF_UNSPEC is used as a match-all */
2292         return ((family == AF_UNSPEC || family == sk->sk_family) &&
2293                 net_eq(sock_net(sk), seq_file_net(seq)));
2294 }
2295
2296 /* Find a non-empty bucket (starting from st->bucket)
2297  * and return the first sk from it.
2298  */
2299 static void *listening_get_first(struct seq_file *seq)
2300 {
2301         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2302         struct tcp_iter_state *st = seq->private;
2303
2304         st->offset = 0;
2305         for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2306                 struct inet_listen_hashbucket *ilb2;
2307                 struct hlist_nulls_node *node;
2308                 struct sock *sk;
2309
2310                 ilb2 = &hinfo->lhash2[st->bucket];
2311                 if (hlist_nulls_empty(&ilb2->nulls_head))
2312                         continue;
2313
2314                 spin_lock(&ilb2->lock);
2315                 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2316                         if (seq_sk_match(seq, sk))
2317                                 return sk;
2318                 }
2319                 spin_unlock(&ilb2->lock);
2320         }
2321
2322         return NULL;
2323 }
2324
2325 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2326  * If "cur" is the last one in the st->bucket,
2327  * call listening_get_first() to return the first sk of the next
2328  * non-empty bucket.
2329  */
2330 static void *listening_get_next(struct seq_file *seq, void *cur)
2331 {
2332         struct tcp_iter_state *st = seq->private;
2333         struct inet_listen_hashbucket *ilb2;
2334         struct hlist_nulls_node *node;
2335         struct inet_hashinfo *hinfo;
2336         struct sock *sk = cur;
2337
2338         ++st->num;
2339         ++st->offset;
2340
2341         sk = sk_nulls_next(sk);
2342         sk_nulls_for_each_from(sk, node) {
2343                 if (seq_sk_match(seq, sk))
2344                         return sk;
2345         }
2346
2347         hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2348         ilb2 = &hinfo->lhash2[st->bucket];
2349         spin_unlock(&ilb2->lock);
2350         ++st->bucket;
2351         return listening_get_first(seq);
2352 }
2353
2354 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2355 {
2356         struct tcp_iter_state *st = seq->private;
2357         void *rc;
2358
2359         st->bucket = 0;
2360         st->offset = 0;
2361         rc = listening_get_first(seq);
2362
2363         while (rc && *pos) {
2364                 rc = listening_get_next(seq, rc);
2365                 --*pos;
2366         }
2367         return rc;
2368 }
2369
2370 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2371                                 const struct tcp_iter_state *st)
2372 {
2373         return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2374 }
2375
2376 /*
2377  * Get first established socket starting from bucket given in st->bucket.
2378  * If st->bucket is zero, the very first socket in the hash is returned.
2379  */
2380 static void *established_get_first(struct seq_file *seq)
2381 {
2382         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2383         struct tcp_iter_state *st = seq->private;
2384
2385         st->offset = 0;
2386         for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2387                 struct sock *sk;
2388                 struct hlist_nulls_node *node;
2389                 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2390
2391                 /* Lockless fast path for the common case of empty buckets */
2392                 if (empty_bucket(hinfo, st))
2393                         continue;
2394
2395                 spin_lock_bh(lock);
2396                 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2397                         if (seq_sk_match(seq, sk))
2398                                 return sk;
2399                 }
2400                 spin_unlock_bh(lock);
2401         }
2402
2403         return NULL;
2404 }
2405
2406 static void *established_get_next(struct seq_file *seq, void *cur)
2407 {
2408         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2409         struct tcp_iter_state *st = seq->private;
2410         struct hlist_nulls_node *node;
2411         struct sock *sk = cur;
2412
2413         ++st->num;
2414         ++st->offset;
2415
2416         sk = sk_nulls_next(sk);
2417
2418         sk_nulls_for_each_from(sk, node) {
2419                 if (seq_sk_match(seq, sk))
2420                         return sk;
2421         }
2422
2423         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2424         ++st->bucket;
2425         return established_get_first(seq);
2426 }
2427
2428 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2429 {
2430         struct tcp_iter_state *st = seq->private;
2431         void *rc;
2432
2433         st->bucket = 0;
2434         rc = established_get_first(seq);
2435
2436         while (rc && pos) {
2437                 rc = established_get_next(seq, rc);
2438                 --pos;
2439         }
2440         return rc;
2441 }
2442
2443 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2444 {
2445         void *rc;
2446         struct tcp_iter_state *st = seq->private;
2447
2448         st->state = TCP_SEQ_STATE_LISTENING;
2449         rc        = listening_get_idx(seq, &pos);
2450
2451         if (!rc) {
2452                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2453                 rc        = established_get_idx(seq, pos);
2454         }
2455
2456         return rc;
2457 }
2458
2459 static void *tcp_seek_last_pos(struct seq_file *seq)
2460 {
2461         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2462         struct tcp_iter_state *st = seq->private;
2463         int bucket = st->bucket;
2464         int offset = st->offset;
2465         int orig_num = st->num;
2466         void *rc = NULL;
2467
2468         switch (st->state) {
2469         case TCP_SEQ_STATE_LISTENING:
2470                 if (st->bucket > hinfo->lhash2_mask)
2471                         break;
2472                 st->state = TCP_SEQ_STATE_LISTENING;
2473                 rc = listening_get_first(seq);
2474                 while (offset-- && rc && bucket == st->bucket)
2475                         rc = listening_get_next(seq, rc);
2476                 if (rc)
2477                         break;
2478                 st->bucket = 0;
2479                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2480                 fallthrough;
2481         case TCP_SEQ_STATE_ESTABLISHED:
2482                 if (st->bucket > hinfo->ehash_mask)
2483                         break;
2484                 rc = established_get_first(seq);
2485                 while (offset-- && rc && bucket == st->bucket)
2486                         rc = established_get_next(seq, rc);
2487         }
2488
2489         st->num = orig_num;
2490
2491         return rc;
2492 }
2493
2494 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2495 {
2496         struct tcp_iter_state *st = seq->private;
2497         void *rc;
2498
2499         if (*pos && *pos == st->last_pos) {
2500                 rc = tcp_seek_last_pos(seq);
2501                 if (rc)
2502                         goto out;
2503         }
2504
2505         st->state = TCP_SEQ_STATE_LISTENING;
2506         st->num = 0;
2507         st->bucket = 0;
2508         st->offset = 0;
2509         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2510
2511 out:
2512         st->last_pos = *pos;
2513         return rc;
2514 }
2515 EXPORT_SYMBOL(tcp_seq_start);
2516
2517 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2518 {
2519         struct tcp_iter_state *st = seq->private;
2520         void *rc = NULL;
2521
2522         if (v == SEQ_START_TOKEN) {
2523                 rc = tcp_get_idx(seq, 0);
2524                 goto out;
2525         }
2526
2527         switch (st->state) {
2528         case TCP_SEQ_STATE_LISTENING:
2529                 rc = listening_get_next(seq, v);
2530                 if (!rc) {
2531                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2532                         st->bucket = 0;
2533                         st->offset = 0;
2534                         rc        = established_get_first(seq);
2535                 }
2536                 break;
2537         case TCP_SEQ_STATE_ESTABLISHED:
2538                 rc = established_get_next(seq, v);
2539                 break;
2540         }
2541 out:
2542         ++*pos;
2543         st->last_pos = *pos;
2544         return rc;
2545 }
2546 EXPORT_SYMBOL(tcp_seq_next);
2547
2548 void tcp_seq_stop(struct seq_file *seq, void *v)
2549 {
2550         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2551         struct tcp_iter_state *st = seq->private;
2552
2553         switch (st->state) {
2554         case TCP_SEQ_STATE_LISTENING:
2555                 if (v != SEQ_START_TOKEN)
2556                         spin_unlock(&hinfo->lhash2[st->bucket].lock);
2557                 break;
2558         case TCP_SEQ_STATE_ESTABLISHED:
2559                 if (v)
2560                         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2561                 break;
2562         }
2563 }
2564 EXPORT_SYMBOL(tcp_seq_stop);
2565
2566 static void get_openreq4(const struct request_sock *req,
2567                          struct seq_file *f, int i)
2568 {
2569         const struct inet_request_sock *ireq = inet_rsk(req);
2570         long delta = req->rsk_timer.expires - jiffies;
2571
2572         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2573                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2574                 i,
2575                 ireq->ir_loc_addr,
2576                 ireq->ir_num,
2577                 ireq->ir_rmt_addr,
2578                 ntohs(ireq->ir_rmt_port),
2579                 TCP_SYN_RECV,
2580                 0, 0, /* could print option size, but that is af dependent. */
2581                 1,    /* timers active (only the expire timer) */
2582                 jiffies_delta_to_clock_t(delta),
2583                 req->num_timeout,
2584                 from_kuid_munged(seq_user_ns(f),
2585                                  sock_i_uid(req->rsk_listener)),
2586                 0,  /* non standard timer */
2587                 0, /* open_requests have no inode */
2588                 0,
2589                 req);
2590 }
2591
2592 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2593 {
2594         int timer_active;
2595         unsigned long timer_expires;
2596         const struct tcp_sock *tp = tcp_sk(sk);
2597         const struct inet_connection_sock *icsk = inet_csk(sk);
2598         const struct inet_sock *inet = inet_sk(sk);
2599         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2600         __be32 dest = inet->inet_daddr;
2601         __be32 src = inet->inet_rcv_saddr;
2602         __u16 destp = ntohs(inet->inet_dport);
2603         __u16 srcp = ntohs(inet->inet_sport);
2604         int rx_queue;
2605         int state;
2606
2607         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2608             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2609             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2610                 timer_active    = 1;
2611                 timer_expires   = icsk->icsk_timeout;
2612         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2613                 timer_active    = 4;
2614                 timer_expires   = icsk->icsk_timeout;
2615         } else if (timer_pending(&sk->sk_timer)) {
2616                 timer_active    = 2;
2617                 timer_expires   = sk->sk_timer.expires;
2618         } else {
2619                 timer_active    = 0;
2620                 timer_expires = jiffies;
2621         }
2622
2623         state = inet_sk_state_load(sk);
2624         if (state == TCP_LISTEN)
2625                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2626         else
2627                 /* Because we don't lock the socket,
2628                  * we might find a transient negative value.
2629                  */
2630                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2631                                       READ_ONCE(tp->copied_seq), 0);
2632
2633         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2634                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2635                 i, src, srcp, dest, destp, state,
2636                 READ_ONCE(tp->write_seq) - tp->snd_una,
2637                 rx_queue,
2638                 timer_active,
2639                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2640                 icsk->icsk_retransmits,
2641                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2642                 icsk->icsk_probes_out,
2643                 sock_i_ino(sk),
2644                 refcount_read(&sk->sk_refcnt), sk,
2645                 jiffies_to_clock_t(icsk->icsk_rto),
2646                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2647                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2648                 tcp_snd_cwnd(tp),
2649                 state == TCP_LISTEN ?
2650                     fastopenq->max_qlen :
2651                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2652 }
2653
2654 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2655                                struct seq_file *f, int i)
2656 {
2657         long delta = tw->tw_timer.expires - jiffies;
2658         __be32 dest, src;
2659         __u16 destp, srcp;
2660
2661         dest  = tw->tw_daddr;
2662         src   = tw->tw_rcv_saddr;
2663         destp = ntohs(tw->tw_dport);
2664         srcp  = ntohs(tw->tw_sport);
2665
2666         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2667                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2668                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2669                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2670                 refcount_read(&tw->tw_refcnt), tw);
2671 }
2672
2673 #define TMPSZ 150
2674
2675 static int tcp4_seq_show(struct seq_file *seq, void *v)
2676 {
2677         struct tcp_iter_state *st;
2678         struct sock *sk = v;
2679
2680         seq_setwidth(seq, TMPSZ - 1);
2681         if (v == SEQ_START_TOKEN) {
2682                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2683                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2684                            "inode");
2685                 goto out;
2686         }
2687         st = seq->private;
2688
2689         if (sk->sk_state == TCP_TIME_WAIT)
2690                 get_timewait4_sock(v, seq, st->num);
2691         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2692                 get_openreq4(v, seq, st->num);
2693         else
2694                 get_tcp4_sock(v, seq, st->num);
2695 out:
2696         seq_pad(seq, '\n');
2697         return 0;
2698 }
2699
2700 #ifdef CONFIG_BPF_SYSCALL
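/* The bpf tcp iterator batches the sockets of one hash bucket under the
 * bucket lock, holding a reference on each, and then shows them one by one
 * with the lock released so that the bpf program may lock the socket.
 */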
2701 struct bpf_tcp_iter_state {
2702         struct tcp_iter_state state;
2703         unsigned int cur_sk;
2704         unsigned int end_sk;
2705         unsigned int max_sk;
2706         struct sock **batch;
2707         bool st_bucket_done;
2708 };
2709
2710 struct bpf_iter__tcp {
2711         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2712         __bpf_md_ptr(struct sock_common *, sk_common);
2713         uid_t uid __aligned(8);
2714 };
2715
2716 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2717                              struct sock_common *sk_common, uid_t uid)
2718 {
2719         struct bpf_iter__tcp ctx;
2720
2721         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2722         ctx.meta = meta;
2723         ctx.sk_common = sk_common;
2724         ctx.uid = uid;
2725         return bpf_iter_run_prog(prog, &ctx);
2726 }
2727
2728 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2729 {
2730         while (iter->cur_sk < iter->end_sk)
2731                 sock_put(iter->batch[iter->cur_sk++]);
2732 }
2733
2734 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2735                                       unsigned int new_batch_sz)
2736 {
2737         struct sock **new_batch;
2738
2739         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2740                              GFP_USER | __GFP_NOWARN);
2741         if (!new_batch)
2742                 return -ENOMEM;
2743
2744         bpf_iter_tcp_put_batch(iter);
2745         kvfree(iter->batch);
2746         iter->batch = new_batch;
2747         iter->max_sk = new_batch_sz;
2748
2749         return 0;
2750 }
2751
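/* Gather the remaining sockets of the current listening bucket into
 * iter->batch, starting from start_sk, then drop the bucket lock.  The
 * return value counts every matching socket seen and may exceed
 * iter->max_sk; the caller then grows the batch array and retries.
 */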
2752 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2753                                                  struct sock *start_sk)
2754 {
2755         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2756         struct bpf_tcp_iter_state *iter = seq->private;
2757         struct tcp_iter_state *st = &iter->state;
2758         struct hlist_nulls_node *node;
2759         unsigned int expected = 1;
2760         struct sock *sk;
2761
2762         sock_hold(start_sk);
2763         iter->batch[iter->end_sk++] = start_sk;
2764
2765         sk = sk_nulls_next(start_sk);
2766         sk_nulls_for_each_from(sk, node) {
2767                 if (seq_sk_match(seq, sk)) {
2768                         if (iter->end_sk < iter->max_sk) {
2769                                 sock_hold(sk);
2770                                 iter->batch[iter->end_sk++] = sk;
2771                         }
2772                         expected++;
2773                 }
2774         }
2775         spin_unlock(&hinfo->lhash2[st->bucket].lock);
2776
2777         return expected;
2778 }
2779
2780 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2781                                                    struct sock *start_sk)
2782 {
2783         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2784         struct bpf_tcp_iter_state *iter = seq->private;
2785         struct tcp_iter_state *st = &iter->state;
2786         struct hlist_nulls_node *node;
2787         unsigned int expected = 1;
2788         struct sock *sk;
2789
2790         sock_hold(start_sk);
2791         iter->batch[iter->end_sk++] = start_sk;
2792
2793         sk = sk_nulls_next(start_sk);
2794         sk_nulls_for_each_from(sk, node) {
2795                 if (seq_sk_match(seq, sk)) {
2796                         if (iter->end_sk < iter->max_sk) {
2797                                 sock_hold(sk);
2798                                 iter->batch[iter->end_sk++] = sk;
2799                         }
2800                         expected++;
2801                 }
2802         }
2803         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2804
2805         return expected;
2806 }
2807
2808 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2809 {
2810         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2811         struct bpf_tcp_iter_state *iter = seq->private;
2812         struct tcp_iter_state *st = &iter->state;
2813         unsigned int expected;
2814         bool resized = false;
2815         struct sock *sk;
2816
2817         /* The st->bucket is done.  Directly advance to the next
2818          * bucket instead of having tcp_seek_last_pos() skip through
2819          * the current bucket one socket at a time only to find out
2820          * it has to advance to the next bucket.
2821          */
2822         if (iter->st_bucket_done) {
2823                 st->offset = 0;
2824                 st->bucket++;
2825                 if (st->state == TCP_SEQ_STATE_LISTENING &&
2826                     st->bucket > hinfo->lhash2_mask) {
2827                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2828                         st->bucket = 0;
2829                 }
2830         }
2831
2832 again:
2833         /* Get a new batch */
2834         iter->cur_sk = 0;
2835         iter->end_sk = 0;
2836         iter->st_bucket_done = false;
2837
2838         sk = tcp_seek_last_pos(seq);
2839         if (!sk)
2840                 return NULL; /* Done */
2841
2842         if (st->state == TCP_SEQ_STATE_LISTENING)
2843                 expected = bpf_iter_tcp_listening_batch(seq, sk);
2844         else
2845                 expected = bpf_iter_tcp_established_batch(seq, sk);
2846
2847         if (iter->end_sk == expected) {
2848                 iter->st_bucket_done = true;
2849                 return sk;
2850         }
2851
2852         if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2853                 resized = true;
2854                 goto again;
2855         }
2856
2857         return sk;
2858 }
2859
2860 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2861 {
2862         /* bpf iter does not support lseek, so it always
2863          * continues from where it was stop()-ped.
2864          */
2865         if (*pos)
2866                 return bpf_iter_tcp_batch(seq);
2867
2868         return SEQ_START_TOKEN;
2869 }
2870
2871 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2872 {
2873         struct bpf_tcp_iter_state *iter = seq->private;
2874         struct tcp_iter_state *st = &iter->state;
2875         struct sock *sk;
2876
2877         /* Whenever seq_next() is called, the iter->cur_sk is
2878          * done with seq_show(), so advance to the next sk in
2879          * the batch.
2880          */
2881         if (iter->cur_sk < iter->end_sk) {
2882                 /* Keeping st->num consistent in tcp_iter_state.
2883                  * bpf_iter_tcp does not use st->num.
2884                  * meta.seq_num is used instead.
2885                  */
2886                 st->num++;
2887                 /* Move st->offset to the next sk in the bucket such that
2888                  * the future start() will resume at st->offset in
2889                  * st->bucket.  See tcp_seek_last_pos().
2890                  */
2891                 st->offset++;
2892                 sock_put(iter->batch[iter->cur_sk++]);
2893         }
2894
2895         if (iter->cur_sk < iter->end_sk)
2896                 sk = iter->batch[iter->cur_sk];
2897         else
2898                 sk = bpf_iter_tcp_batch(seq);
2899
2900         ++*pos;
2901         /* Keeping st->last_pos consistent in tcp_iter_state.
2902          * bpf iter does not do lseek, so st->last_pos always equals *pos.
2903          */
2904         st->last_pos = *pos;
2905         return sk;
2906 }
2907
2908 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2909 {
2910         struct bpf_iter_meta meta;
2911         struct bpf_prog *prog;
2912         struct sock *sk = v;
2913         bool slow;
2914         uid_t uid;
2915         int ret;
2916
2917         if (v == SEQ_START_TOKEN)
2918                 return 0;
2919
2920         if (sk_fullsock(sk))
2921                 slow = lock_sock_fast(sk);
2922
2923         if (unlikely(sk_unhashed(sk))) {
2924                 ret = SEQ_SKIP;
2925                 goto unlock;
2926         }
2927
2928         if (sk->sk_state == TCP_TIME_WAIT) {
2929                 uid = 0;
2930         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2931                 const struct request_sock *req = v;
2932
2933                 uid = from_kuid_munged(seq_user_ns(seq),
2934                                        sock_i_uid(req->rsk_listener));
2935         } else {
2936                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2937         }
2938
2939         meta.seq = seq;
2940         prog = bpf_iter_get_info(&meta, false);
2941         ret = tcp_prog_seq_show(prog, &meta, v, uid);
2942
2943 unlock:
2944         if (sk_fullsock(sk))
2945                 unlock_sock_fast(sk, slow);
2946         return ret;
2948 }
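/* Note on locking in bpf_iter_tcp_seq_show(): only full sockets are locked
 * with lock_sock_fast(); request and timewait sockets are not full sockets,
 * so they are passed to the bpf prog without the owner lock, and a socket
 * that became unhashed while sitting in the batch is skipped.
 */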
2949
2950 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2951 {
2952         struct bpf_tcp_iter_state *iter = seq->private;
2953         struct bpf_iter_meta meta;
2954         struct bpf_prog *prog;
2955
2956         if (!v) {
2957                 meta.seq = seq;
2958                 prog = bpf_iter_get_info(&meta, true);
2959                 if (prog)
2960                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
2961         }
2962
2963         if (iter->cur_sk < iter->end_sk) {
2964                 bpf_iter_tcp_put_batch(iter);
2965                 iter->st_bucket_done = false;
2966         }
2967 }
2968
2969 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2970         .show           = bpf_iter_tcp_seq_show,
2971         .start          = bpf_iter_tcp_seq_start,
2972         .next           = bpf_iter_tcp_seq_next,
2973         .stop           = bpf_iter_tcp_seq_stop,
2974 };
2975 #endif
2976 static unsigned short seq_file_family(const struct seq_file *seq)
2977 {
2978         const struct tcp_seq_afinfo *afinfo;
2979
2980 #ifdef CONFIG_BPF_SYSCALL
2981         /* Iterated from bpf_iter.  Let the bpf prog filter instead. */
2982         if (seq->op == &bpf_iter_tcp_seq_ops)
2983                 return AF_UNSPEC;
2984 #endif
2985
2986         /* Iterated from proc fs */
2987         afinfo = pde_data(file_inode(seq->file));
2988         return afinfo->family;
2989 }
2990
2991 static const struct seq_operations tcp4_seq_ops = {
2992         .show           = tcp4_seq_show,
2993         .start          = tcp_seq_start,
2994         .next           = tcp_seq_next,
2995         .stop           = tcp_seq_stop,
2996 };
2997
2998 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2999         .family         = AF_INET,
3000 };
3001
3002 static int __net_init tcp4_proc_init_net(struct net *net)
3003 {
3004         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3005                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3006                 return -ENOMEM;
3007         return 0;
3008 }
3009
3010 static void __net_exit tcp4_proc_exit_net(struct net *net)
3011 {
3012         remove_proc_entry("tcp", net->proc_net);
3013 }
3014
3015 static struct pernet_operations tcp4_net_ops = {
3016         .init = tcp4_proc_init_net,
3017         .exit = tcp4_proc_exit_net,
3018 };
3019
3020 int __init tcp4_proc_init(void)
3021 {
3022         return register_pernet_subsys(&tcp4_net_ops);
3023 }
3024
3025 void tcp4_proc_exit(void)
3026 {
3027         unregister_pernet_subsys(&tcp4_net_ops);
3028 }
3029 #endif /* CONFIG_PROC_FS */
3030
3031 /* @wake is one when sk_stream_write_space() calls us.
3032  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3033  * This mimics the strategy used in sock_def_write_space().
3034  */
3035 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3036 {
3037         const struct tcp_sock *tp = tcp_sk(sk);
3038         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3039                             READ_ONCE(tp->snd_nxt);
3040
3041         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3042 }
3043 EXPORT_SYMBOL(tcp_stream_memory_free);
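/* Worked example of the @wake shift above (illustrative numbers only): with
 * tcp_notsent_lowat(tp) == 128 KB and 96 KB not yet sent, a regular poll sees
 * the socket as writable (96 KB < 128 KB), but the wake == 1 path from
 * sk_stream_write_space() does not (96 KB << 1 == 192 KB, which is not
 * < 128 KB); it only reports EPOLLOUT once notsent_bytes drops below 64 KB,
 * i.e. below half the limit.
 */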
3044
3045 struct proto tcp_prot = {
3046         .name                   = "TCP",
3047         .owner                  = THIS_MODULE,
3048         .close                  = tcp_close,
3049         .pre_connect            = tcp_v4_pre_connect,
3050         .connect                = tcp_v4_connect,
3051         .disconnect             = tcp_disconnect,
3052         .accept                 = inet_csk_accept,
3053         .ioctl                  = tcp_ioctl,
3054         .init                   = tcp_v4_init_sock,
3055         .destroy                = tcp_v4_destroy_sock,
3056         .shutdown               = tcp_shutdown,
3057         .setsockopt             = tcp_setsockopt,
3058         .getsockopt             = tcp_getsockopt,
3059         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3060         .keepalive              = tcp_set_keepalive,
3061         .recvmsg                = tcp_recvmsg,
3062         .sendmsg                = tcp_sendmsg,
3063         .sendpage               = tcp_sendpage,
3064         .backlog_rcv            = tcp_v4_do_rcv,
3065         .release_cb             = tcp_release_cb,
3066         .hash                   = inet_hash,
3067         .unhash                 = inet_unhash,
3068         .get_port               = inet_csk_get_port,
3069         .put_port               = inet_put_port,
3070 #ifdef CONFIG_BPF_SYSCALL
3071         .psock_update_sk_prot   = tcp_bpf_update_proto,
3072 #endif
3073         .enter_memory_pressure  = tcp_enter_memory_pressure,
3074         .leave_memory_pressure  = tcp_leave_memory_pressure,
3075         .stream_memory_free     = tcp_stream_memory_free,
3076         .sockets_allocated      = &tcp_sockets_allocated,
3077         .orphan_count           = &tcp_orphan_count,
3078
3079         .memory_allocated       = &tcp_memory_allocated,
3080         .per_cpu_fw_alloc       = &tcp_memory_per_cpu_fw_alloc,
3081
3082         .memory_pressure        = &tcp_memory_pressure,
3083         .sysctl_mem             = sysctl_tcp_mem,
3084         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3085         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3086         .max_header             = MAX_TCP_HEADER,
3087         .obj_size               = sizeof(struct tcp_sock),
3088         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3089         .twsk_prot              = &tcp_timewait_sock_ops,
3090         .rsk_prot               = &tcp_request_sock_ops,
3091         .h.hashinfo             = NULL,
3092         .no_autobind            = true,
3093         .diag_destroy           = tcp_abort,
3094 };
3095 EXPORT_SYMBOL(tcp_prot);
3096
3097 static void __net_exit tcp_sk_exit(struct net *net)
3098 {
3099         if (net->ipv4.tcp_congestion_control)
3100                 bpf_module_put(net->ipv4.tcp_congestion_control,
3101                                net->ipv4.tcp_congestion_control->owner);
3102 }
3103
3104 static void __net_init tcp_set_hashinfo(struct net *net)
3105 {
3106         struct inet_hashinfo *hinfo;
3107         unsigned int ehash_entries;
3108         struct net *old_net;
3109
3110         if (net_eq(net, &init_net))
3111                 goto fallback;
3112
3113         old_net = current->nsproxy->net_ns;
3114         ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3115         if (!ehash_entries)
3116                 goto fallback;
3117
3118         ehash_entries = roundup_pow_of_two(ehash_entries);
3119         hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3120         if (!hinfo) {
3121                 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3122                         "for a netns, falling back to the global one\n",
3123                         ehash_entries);
3124 fallback:
3125                 hinfo = &tcp_hashinfo;
3126                 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3127         }
3128
3129         net->ipv4.tcp_death_row.hashinfo = hinfo;
3130         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3131         net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3132 }
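/* Example usage (a sketch, not part of the kernel): a per-netns ehash is only
 * allocated when the netns creating the child has set the sysctl first, e.g.
 * from a shell:
 *
 *     sysctl -w net.ipv4.tcp_child_ehash_entries=4096
 *     unshare -n <command>     # the new netns gets its own 4096-entry ehash
 *
 * Otherwise (sysctl left at 0, or the allocation fails) the child falls back
 * to the global tcp_hashinfo, as handled above.
 */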
3133
3134 static int __net_init tcp_sk_init(struct net *net)
3135 {
3136         net->ipv4.sysctl_tcp_ecn = 2;
3137         net->ipv4.sysctl_tcp_ecn_fallback = 1;
3138
3139         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3140         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3141         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3142         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3143         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3144
3145         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3146         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3147         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3148
3149         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3150         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3151         net->ipv4.sysctl_tcp_syncookies = 1;
3152         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3153         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3154         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3155         net->ipv4.sysctl_tcp_orphan_retries = 0;
3156         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3157         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3158         net->ipv4.sysctl_tcp_tw_reuse = 2;
3159         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3160
3161         refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3162         tcp_set_hashinfo(net);
3163
3164         net->ipv4.sysctl_tcp_sack = 1;
3165         net->ipv4.sysctl_tcp_window_scaling = 1;
3166         net->ipv4.sysctl_tcp_timestamps = 1;
3167         net->ipv4.sysctl_tcp_early_retrans = 3;
3168         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3169         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3170         net->ipv4.sysctl_tcp_retrans_collapse = 1;
3171         net->ipv4.sysctl_tcp_max_reordering = 300;
3172         net->ipv4.sysctl_tcp_dsack = 1;
3173         net->ipv4.sysctl_tcp_app_win = 31;
3174         net->ipv4.sysctl_tcp_adv_win_scale = 1;
3175         net->ipv4.sysctl_tcp_frto = 2;
3176         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3177         /* This limits the percentage of the congestion window which we
3178          * will allow a single TSO frame to consume.  Building TSO frames
3179          * which are too large can cause TCP streams to be bursty.
3180          */
3181         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3182         /* Default TSQ limit of 16 TSO segments */
3183         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3184
3185         /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3186         net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3187
3188         net->ipv4.sysctl_tcp_min_tso_segs = 2;
3189         net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3190         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3191         net->ipv4.sysctl_tcp_autocorking = 1;
3192         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3193         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3194         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3195         if (net != &init_net) {
3196                 memcpy(net->ipv4.sysctl_tcp_rmem,
3197                        init_net.ipv4.sysctl_tcp_rmem,
3198                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
3199                 memcpy(net->ipv4.sysctl_tcp_wmem,
3200                        init_net.ipv4.sysctl_tcp_wmem,
3201                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
3202         }
3203         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3204         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3205         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3206         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3207         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3208         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3209
3210         /* Reno is always built in */
3211         if (!net_eq(net, &init_net) &&
3212             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3213                                init_net.ipv4.tcp_congestion_control->owner))
3214                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3215         else
3216                 net->ipv4.tcp_congestion_control = &tcp_reno;
3217
3218         return 0;
3219 }
3220
3221 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3222 {
3223         struct net *net;
3224
3225         tcp_twsk_purge(net_exit_list, AF_INET);
3226
3227         list_for_each_entry(net, net_exit_list, exit_list) {
3228                 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3229                 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3230                 tcp_fastopen_ctx_destroy(net);
3231         }
3232 }
3233
3234 static struct pernet_operations __net_initdata tcp_sk_ops = {
3235        .init       = tcp_sk_init,
3236        .exit       = tcp_sk_exit,
3237        .exit_batch = tcp_sk_exit_batch,
3238 };
3239
3240 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3241 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3242                      struct sock_common *sk_common, uid_t uid)
3243
3244 #define INIT_BATCH_SZ 16
3245
3246 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3247 {
3248         struct bpf_tcp_iter_state *iter = priv_data;
3249         int err;
3250
3251         err = bpf_iter_init_seq_net(priv_data, aux);
3252         if (err)
3253                 return err;
3254
3255         err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3256         if (err) {
3257                 bpf_iter_fini_seq_net(priv_data);
3258                 return err;
3259         }
3260
3261         return 0;
3262 }
3263
3264 static void bpf_iter_fini_tcp(void *priv_data)
3265 {
3266         struct bpf_tcp_iter_state *iter = priv_data;
3267
3268         bpf_iter_fini_seq_net(priv_data);
3269         kvfree(iter->batch);
3270 }
3271
3272 static const struct bpf_iter_seq_info tcp_seq_info = {
3273         .seq_ops                = &bpf_iter_tcp_seq_ops,
3274         .init_seq_private       = bpf_iter_init_tcp,
3275         .fini_seq_private       = bpf_iter_fini_tcp,
3276         .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3277 };
3278
3279 static const struct bpf_func_proto *
3280 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3281                             const struct bpf_prog *prog)
3282 {
3283         switch (func_id) {
3284         case BPF_FUNC_setsockopt:
3285                 return &bpf_sk_setsockopt_proto;
3286         case BPF_FUNC_getsockopt:
3287                 return &bpf_sk_getsockopt_proto;
3288         default:
3289                 return NULL;
3290         }
3291 }
3292
3293 static struct bpf_iter_reg tcp_reg_info = {
3294         .target                 = "tcp",
3295         .ctx_arg_info_size      = 1,
3296         .ctx_arg_info           = {
3297                 { offsetof(struct bpf_iter__tcp, sk_common),
3298                   PTR_TO_BTF_ID_OR_NULL },
3299         },
3300         .get_func_proto         = bpf_iter_tcp_get_func_proto,
3301         .seq_info               = &tcp_seq_info,
3302 };
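/* Example consumer (a sketch under typical libbpf assumptions, not part of
 * this file): a minimal BPF program attaching to the "tcp" iterator target
 * registered here.  The context layout follows DEFINE_BPF_ITER_FUNC(tcp, ...)
 * above; BPF_SEQ_PRINTF is libbpf's bpf_seq_printf() convenience wrapper, and
 * the usual vmlinux.h include and GPL license declaration are omitted.
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "%pI4:%u uid=%u\n",
 *			       &skc->skc_rcv_saddr, skc->skc_num, ctx->uid);
 *		return 0;
 *	}
 *
 * Attaching the program and reading the resulting iterator (e.g. a bpffs pin)
 * then produces one line per TCP socket.
 */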
3303
3304 static void __init bpf_iter_register(void)
3305 {
3306         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3307         if (bpf_iter_reg_target(&tcp_reg_info))
3308                 pr_warn("Warning: could not register bpf iterator tcp\n");
3309 }
3310
3311 #endif
3312
3313 void __init tcp_v4_init(void)
3314 {
3315         int cpu, res;
3316
3317         for_each_possible_cpu(cpu) {
3318                 struct sock *sk;
3319
3320                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3321                                            IPPROTO_TCP, &init_net);
3322                 if (res)
3323                         panic("Failed to create the TCP control socket.\n");
3324                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3325
3326                 /* Enforce IP_DF and IPID==0 for RST and
3327                  * ACK packets sent in SYN-RECV and TIME-WAIT states.
3328                  */
3329                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3330
3331                 per_cpu(ipv4_tcp_sk, cpu) = sk;
3332         }
3333         if (register_pernet_subsys(&tcp_sk_ops))
3334                 panic("Failed to create the TCP control socket.\n");
3335
3336 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3337         bpf_iter_register();
3338 #endif
3339 }