1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * Implementation of the Transmission Control Protocol(TCP).
9 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
48 #define pr_fmt(fmt) "TCP: " fmt
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
61 #include <net/net_namespace.h>
63 #include <net/inet_hashtables.h>
65 #include <net/transp_v6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
84 #include <trace/events/tcp.h>
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
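/* Per-CPU kernel control socket, used by tcp_v4_send_reset() and
 * tcp_v4_send_ack() to emit replies (RSTs, TIME-WAIT/SYN-RECV ACKs) that
 * are not tied to a full socket.
 */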
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
98 return secure_tcp_seq(ip_hdr(skb)->daddr,
101 tcp_hdr(skb)->source);
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
106 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112 const struct inet_timewait_sock *tw = inet_twsk(sktw);
113 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 struct tcp_sock *tp = tcp_sk(sk);
117 /* Still does not detect *everything* that goes through
118 * lo, since we require a loopback src or dst address
119 * or direct binding to 'lo' interface.
121 bool loopback = false;
122 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
124 #if IS_ENABLED(CONFIG_IPV6)
125 if (tw->tw_family == AF_INET6) {
126 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
134 if (ipv4_is_loopback(tw->tw_daddr) ||
135 ipv4_is_loopback(tw->tw_rcv_saddr))
142 /* With PAWS, it is safe from the viewpoint
143 of data integrity. Even without PAWS it is safe provided sequence
144 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
   Actually, the idea is close to VJ's one, only the timestamp cache is
   held not per host but per port pair, and the TW bucket is used as the
   state holder.

   If the TW bucket has already been destroyed we fall back to VJ's scheme
   and use the initial timestamp retrieved from the peer table.
 */
153 if (tcptw->tw_ts_recent_stamp &&
154 (!twp || (reuse && time_after32(ktime_get_seconds(),
155 tcptw->tw_ts_recent_stamp)))) {
156 /* In case of repair and re-using TIME-WAIT sockets we still
157 * want to be sure that it is safe as above but honor the
 * sequence numbers and time stamps set as part of the repair process.
161 * Without this check re-using a TIME-WAIT socket with TCP
162 * repair would accumulate a -1 on the repair assigned
163 * sequence number. The first time it is reused the sequence
164 * is -1, the second time -2, etc. This fixes that issue
165 * without appearing to create any others.
167 if (likely(!tp->repair)) {
168 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
172 WRITE_ONCE(tp->write_seq, seq);
173 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
174 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
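/* Called via the pre_connect proto op before tcp_v4_connect(): gives
 * BPF_CGROUP_INET4_CONNECT programs a chance to observe or rewrite the
 * destination address while the socket lock is held.
 */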
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent the BPF program called below from accessing bytes that are
	 * outside the bound specified by the user in addr_len.
	 */
191 if (addr_len < sizeof(struct sockaddr_in))
194 sock_owned_by_me(sk);
196 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
202 struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
203 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
204 struct inet_timewait_death_row *tcp_death_row;
205 __be32 daddr, nexthop, prev_sk_rcv_saddr;
206 struct inet_sock *inet = inet_sk(sk);
207 struct tcp_sock *tp = tcp_sk(sk);
208 struct ip_options_rcu *inet_opt;
209 struct net *net = sock_net(sk);
210 __be16 orig_sport, orig_dport;
215 if (addr_len < sizeof(struct sockaddr_in))
218 if (usin->sin_family != AF_INET)
219 return -EAFNOSUPPORT;
221 nexthop = daddr = usin->sin_addr.s_addr;
222 inet_opt = rcu_dereference_protected(inet->inet_opt,
223 lockdep_sock_is_held(sk));
224 if (inet_opt && inet_opt->opt.srr) {
227 nexthop = inet_opt->opt.faddr;
230 orig_sport = inet->inet_sport;
231 orig_dport = usin->sin_port;
232 fl4 = &inet->cork.fl.u.ip4;
233 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
234 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
238 if (err == -ENETUNREACH)
239 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
243 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
248 if (!inet_opt || !inet_opt->opt.srr)
251 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
253 if (!inet->inet_saddr) {
254 if (inet_csk(sk)->icsk_bind2_hash) {
255 prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo,
256 sk, net, inet->inet_num);
257 prev_sk_rcv_saddr = sk->sk_rcv_saddr;
259 inet->inet_saddr = fl4->saddr;
262 sk_rcv_saddr_set(sk, inet->inet_saddr);
264 if (prev_addr_hashbucket) {
265 err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
267 inet->inet_saddr = 0;
268 sk_rcv_saddr_set(sk, prev_sk_rcv_saddr);
274 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
275 /* Reset inherited state */
276 tp->rx_opt.ts_recent = 0;
277 tp->rx_opt.ts_recent_stamp = 0;
278 if (likely(!tp->repair))
279 WRITE_ONCE(tp->write_seq, 0);
282 inet->inet_dport = usin->sin_port;
283 sk_daddr_set(sk, daddr);
285 inet_csk(sk)->icsk_ext_hdr_len = 0;
287 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
289 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
296 tcp_set_state(sk, TCP_SYN_SENT);
297 err = inet_hash_connect(tcp_death_row, sk);
303 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
304 inet->inet_sport, inet->inet_dport, sk);
310 /* OK, now commit destination to socket. */
311 sk->sk_gso_type = SKB_GSO_TCPV4;
312 sk_setup_caps(sk, &rt->dst);
315 if (likely(!tp->repair)) {
317 WRITE_ONCE(tp->write_seq,
318 secure_tcp_seq(inet->inet_saddr,
322 tp->tsoffset = secure_tcp_ts_off(net, inet->inet_saddr,
326 inet->inet_id = get_random_u16();
328 if (tcp_fastopen_defer_connect(sk, &err))
333 err = tcp_connect(sk);
	 * This unhashes the socket and releases the local port, if necessary.
345 tcp_set_state(sk, TCP_CLOSE);
346 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
347 inet_reset_saddr(sk);
349 sk->sk_route_caps = 0;
350 inet->inet_dport = 0;
353 EXPORT_SYMBOL(tcp_v4_connect);
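/* Illustrative userspace sketch (not part of this file): a plain connect()
 * on an AF_INET stream socket is what ends up in tcp_v4_connect() above.
 * Error checking is omitted and the address is a documentation placeholder.
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *	close(fd);
 */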
/*
 * This routine reacts to ICMP_FRAG_NEEDED MTU indications as defined in RFC 1191.
 * It can be called through tcp_release_cb() if the socket was owned by the user
 * at the time tcp_v4_err() was called to handle the ICMP message.
 */
360 void tcp_v4_mtu_reduced(struct sock *sk)
362 struct inet_sock *inet = inet_sk(sk);
363 struct dst_entry *dst;
366 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
368 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
369 dst = inet_csk_update_pmtu(sk, mtu);
	/* Something is about to go wrong. Remember the soft error
	 * in case this connection is not able to recover.
	 */
376 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
377 sk->sk_err_soft = EMSGSIZE;
381 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
382 ip_sk_accept_pmtu(sk) &&
383 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
384 tcp_sync_mss(sk, mtu);
		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path MTU discovery.
		 */
391 tcp_simple_retransmit(sk);
392 } /* else let the usual retransmit timer handle it */
394 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
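/* Handle an ICMP redirect: if the socket still has a valid cached route,
 * let that dst's ops update the next hop. __sk_dst_check() filters out
 * stale entries.
 */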
396 static void do_redirect(struct sk_buff *skb, struct sock *sk)
398 struct dst_entry *dst = __sk_dst_check(sk, 0);
401 dst->ops->redirect(dst, sk, skb);
405 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
406 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
408 struct request_sock *req = inet_reqsk(sk);
409 struct net *net = sock_net(sk);
411 /* ICMPs are not backlogged, hence we cannot get
412 * an established socket here.
414 if (seq != tcp_rsk(req)->snt_isn) {
415 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
418 * Still in SYN_RECV, just remove it silently.
419 * There is no good way to pass the error to the newly
420 * created socket, and POSIX does not want network
421 * errors returned from accept().
423 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
424 tcp_listendrop(req->rsk_listener);
428 EXPORT_SYMBOL(tcp_req_err);
430 /* TCP-LD (RFC 6069) logic */
431 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
433 struct inet_connection_sock *icsk = inet_csk(sk);
434 struct tcp_sock *tp = tcp_sk(sk);
439 if (sock_owned_by_user(sk))
442 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
446 skb = tcp_rtx_queue_head(sk);
447 if (WARN_ON_ONCE(!skb))
450 icsk->icsk_backoff--;
451 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
452 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
454 tcp_mstamp_refresh(tp);
455 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
456 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
459 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
460 remaining, TCP_RTO_MAX);
462 /* RTO revert clocked out retransmission.
463 * Will retransmit now.
465 tcp_retransmit_timer(sk);
468 EXPORT_SYMBOL(tcp_ld_RTO_revert);
471 * This routine is called by the ICMP module when it gets some
472 * sort of error condition. If err < 0 then the socket should
473 * be closed and the error returned to the user. If err > 0
474 * it's just the icmp type << 8 | icmp code. After adjustment
475 * header points to the first 8 bytes of the tcp header. We need
476 * to find the appropriate port.
478 * The locking strategy used here is very "optimistic". When
479 * someone else accesses the socket the ICMP is just dropped
480 * and for some paths there is no check at all.
481 * A more general error queue to queue errors for later handling
482 * is probably better.
486 int tcp_v4_err(struct sk_buff *skb, u32 info)
488 const struct iphdr *iph = (const struct iphdr *)skb->data;
489 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
491 struct inet_sock *inet;
492 const int type = icmp_hdr(skb)->type;
493 const int code = icmp_hdr(skb)->code;
495 struct request_sock *fastopen;
498 struct net *net = dev_net(skb->dev);
500 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
501 iph->daddr, th->dest, iph->saddr,
502 ntohs(th->source), inet_iif(skb), 0);
504 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
507 if (sk->sk_state == TCP_TIME_WAIT) {
508 inet_twsk_put(inet_twsk(sk));
511 seq = ntohl(th->seq);
512 if (sk->sk_state == TCP_NEW_SYN_RECV) {
513 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
514 type == ICMP_TIME_EXCEEDED ||
515 (type == ICMP_DEST_UNREACH &&
516 (code == ICMP_NET_UNREACH ||
517 code == ICMP_HOST_UNREACH)));
522 /* If too many ICMPs get dropped on busy
523 * servers this needs to be solved differently.
	 * We do take care of the PMTU discovery (RFC 1191) special case:
	 * we can receive locally generated ICMP messages while the socket is held.
	 */
527 if (sock_owned_by_user(sk)) {
528 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
529 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
531 if (sk->sk_state == TCP_CLOSE)
534 if (static_branch_unlikely(&ip4_min_ttl)) {
535 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
536 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
537 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
544 fastopen = rcu_dereference(tp->fastopen_rsk);
545 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
546 if (sk->sk_state != TCP_LISTEN &&
547 !between(seq, snd_una, tp->snd_nxt)) {
548 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
554 if (!sock_owned_by_user(sk))
555 do_redirect(skb, sk);
557 case ICMP_SOURCE_QUENCH:
558 /* Just silently ignore these. */
560 case ICMP_PARAMETERPROB:
563 case ICMP_DEST_UNREACH:
564 if (code > NR_ICMP_UNREACH)
567 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
			 * they should go through unfragmented).
			 */
572 if (sk->sk_state == TCP_LISTEN)
575 WRITE_ONCE(tp->mtu_info, info);
576 if (!sock_owned_by_user(sk)) {
577 tcp_v4_mtu_reduced(sk);
579 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
585 err = icmp_err_convert[code].errno;
586 /* check if this ICMP message allows revert of backoff.
590 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
591 tcp_ld_RTO_revert(sk, seq);
593 case ICMP_TIME_EXCEEDED:
600 switch (sk->sk_state) {
603 /* Only in fast or simultaneous open. If a fast open socket is
604 * already accepted it is treated as a connected one below.
606 if (fastopen && !fastopen->sk)
609 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
611 if (!sock_owned_by_user(sk)) {
618 sk->sk_err_soft = err;
	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
	 * to be considered hard errors (well, FRAG_FAILED too,
	 * but it is obsoleted by PMTU discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors ordered by their masters, even these two messages have
	 * lost their original sense (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 */
640 if (!sock_owned_by_user(sk) && inet->recverr) {
643 } else { /* Only an error on timeout */
644 sk->sk_err_soft = err;
653 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
655 struct tcphdr *th = tcp_hdr(skb);
657 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
658 skb->csum_start = skb_transport_header(skb) - skb->head;
659 skb->csum_offset = offsetof(struct tcphdr, check);
662 /* This routine computes an IPv4 TCP checksum. */
663 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
665 const struct inet_sock *inet = inet_sk(sk);
667 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
669 EXPORT_SYMBOL(tcp_v4_send_check);
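/* Note: __tcp_v4_send_check() only stores the pseudo-header checksum and
 * records csum_start/csum_offset; the device (or skb_checksum_help() as a
 * software fallback) fills in the final checksum for CHECKSUM_PARTIAL skbs.
 */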
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)?
 *	Answer: if a packet caused an RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other side's TCP.
 *		So we build the reply based only on the parameters
 *		that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
684 #ifdef CONFIG_TCP_MD5SIG
685 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
687 #define OPTION_BYTES sizeof(__be32)
690 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
692 const struct tcphdr *th = tcp_hdr(skb);
695 __be32 opt[OPTION_BYTES / sizeof(__be32)];
697 struct ip_reply_arg arg;
698 #ifdef CONFIG_TCP_MD5SIG
699 struct tcp_md5sig_key *key = NULL;
700 const __u8 *hash_location = NULL;
701 unsigned char newhash[16];
703 struct sock *sk1 = NULL;
705 u64 transmit_time = 0;
709 /* Never send a reset in response to a reset. */
	/* If sk is not NULL, it means we did a successful lookup and the incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
716 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
719 /* Swap the send and the receive. */
720 memset(&rep, 0, sizeof(rep));
721 rep.th.dest = th->source;
722 rep.th.source = th->dest;
723 rep.th.doff = sizeof(struct tcphdr) / 4;
727 rep.th.seq = th->ack_seq;
730 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
731 skb->len - (th->doff << 2));
734 memset(&arg, 0, sizeof(arg));
735 arg.iov[0].iov_base = (unsigned char *)&rep;
736 arg.iov[0].iov_len = sizeof(rep.th);
738 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
739 #ifdef CONFIG_TCP_MD5SIG
741 hash_location = tcp_parse_md5sig_option(th);
742 if (sk && sk_fullsock(sk)) {
743 const union tcp_md5_addr *addr;
746 /* sdif set, means packet ingressed via a device
747 * in an L3 domain and inet_iif is set to it.
749 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
750 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
751 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
752 } else if (hash_location) {
753 const union tcp_md5_addr *addr;
754 int sdif = tcp_v4_sdif(skb);
755 int dif = inet_iif(skb);
		/*
		 * The active side is lost. Try to find the listening socket through
		 * the source port, and then find the md5 key through the listening
		 * socket. We are not losing security here:
		 * the incoming packet is checked with the md5 hash of the found key;
		 * no RST is generated if the md5 hash doesn't match.
		 */
765 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
766 NULL, 0, ip_hdr(skb)->saddr,
767 th->source, ip_hdr(skb)->daddr,
768 ntohs(th->source), dif, sdif);
		/* don't send an RST if we can't find a key */
773 /* sdif set, means packet ingressed via a device
774 * in an L3 domain and dif is set to it.
776 l3index = sdif ? dif : 0;
777 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
778 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
783 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
784 if (genhash || memcmp(hash_location, newhash, 16) != 0)
790 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
792 (TCPOPT_MD5SIG << 8) |
794 /* Update length and the length the header thinks exists */
795 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
796 rep.th.doff = arg.iov[0].iov_len / 4;
798 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
799 key, ip_hdr(skb)->saddr,
800 ip_hdr(skb)->daddr, &rep.th);
803 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
804 if (rep.opt[0] == 0) {
805 __be32 mrst = mptcp_reset_option(skb);
809 arg.iov[0].iov_len += sizeof(mrst);
810 rep.th.doff = arg.iov[0].iov_len / 4;
814 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
815 ip_hdr(skb)->saddr, /* XXX */
816 arg.iov[0].iov_len, IPPROTO_TCP, 0);
817 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
818 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When the socket is gone, all binding information is lost.
	 * Routing might fail in this case. No choice here: if we choose to force
	 * the input interface, we will misroute in case of an asymmetric route.
	 */
825 arg.bound_dev_if = sk->sk_bound_dev_if;
827 trace_tcp_send_reset(sk, skb);
830 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
831 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
833 arg.tos = ip_hdr(skb)->tos;
834 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
836 ctl_sk = this_cpu_read(ipv4_tcp_sk);
837 sock_net_set(ctl_sk, net);
839 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
840 inet_twsk(sk)->tw_mark : sk->sk_mark;
841 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
842 inet_twsk(sk)->tw_priority : sk->sk_priority;
843 transmit_time = tcp_transmit_time(sk);
844 xfrm_sk_clone_policy(ctl_sk, sk);
846 ip_send_unicast_reply(ctl_sk,
847 skb, &TCP_SKB_CB(skb)->header.h4.opt,
848 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
849 &arg, arg.iov[0].iov_len,
853 xfrm_sk_free_policy(ctl_sk);
854 sock_net_set(ctl_sk, &init_net);
855 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
856 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
859 #ifdef CONFIG_TCP_MD5SIG
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */
869 static void tcp_v4_send_ack(const struct sock *sk,
870 struct sk_buff *skb, u32 seq, u32 ack,
871 u32 win, u32 tsval, u32 tsecr, int oif,
872 struct tcp_md5sig_key *key,
873 int reply_flags, u8 tos)
875 const struct tcphdr *th = tcp_hdr(skb);
878 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
879 #ifdef CONFIG_TCP_MD5SIG
880 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
884 struct net *net = sock_net(sk);
885 struct ip_reply_arg arg;
889 memset(&rep.th, 0, sizeof(struct tcphdr));
890 memset(&arg, 0, sizeof(arg));
892 arg.iov[0].iov_base = (unsigned char *)&rep;
893 arg.iov[0].iov_len = sizeof(rep.th);
895 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
896 (TCPOPT_TIMESTAMP << 8) |
898 rep.opt[1] = htonl(tsval);
899 rep.opt[2] = htonl(tsecr);
900 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
903 /* Swap the send and the receive. */
904 rep.th.dest = th->source;
905 rep.th.source = th->dest;
906 rep.th.doff = arg.iov[0].iov_len / 4;
907 rep.th.seq = htonl(seq);
908 rep.th.ack_seq = htonl(ack);
910 rep.th.window = htons(win);
912 #ifdef CONFIG_TCP_MD5SIG
914 int offset = (tsecr) ? 3 : 0;
916 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
918 (TCPOPT_MD5SIG << 8) |
920 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
921 rep.th.doff = arg.iov[0].iov_len/4;
923 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
924 key, ip_hdr(skb)->saddr,
925 ip_hdr(skb)->daddr, &rep.th);
928 arg.flags = reply_flags;
929 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
930 ip_hdr(skb)->saddr, /* XXX */
931 arg.iov[0].iov_len, IPPROTO_TCP, 0);
932 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
934 arg.bound_dev_if = oif;
936 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
938 ctl_sk = this_cpu_read(ipv4_tcp_sk);
939 sock_net_set(ctl_sk, net);
940 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
941 inet_twsk(sk)->tw_mark : sk->sk_mark;
942 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
943 inet_twsk(sk)->tw_priority : sk->sk_priority;
944 transmit_time = tcp_transmit_time(sk);
945 ip_send_unicast_reply(ctl_sk,
946 skb, &TCP_SKB_CB(skb)->header.h4.opt,
947 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
948 &arg, arg.iov[0].iov_len,
952 sock_net_set(ctl_sk, &init_net);
953 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
957 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
959 struct inet_timewait_sock *tw = inet_twsk(sk);
960 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
962 tcp_v4_send_ack(sk, skb,
963 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
964 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
965 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
968 tcp_twsk_md5_key(tcptw),
969 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
976 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
977 struct request_sock *req)
979 const union tcp_md5_addr *addr;
982 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
983 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
985 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
993 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
994 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
995 tcp_v4_send_ack(sk, skb, seq,
996 tcp_rsk(req)->rcv_nxt,
997 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
998 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
1001 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
1002 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1007 * Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big socket.
1011 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1013 struct request_sock *req,
1014 struct tcp_fastopen_cookie *foc,
1015 enum tcp_synack_type synack_type,
1016 struct sk_buff *syn_skb)
1018 const struct inet_request_sock *ireq = inet_rsk(req);
1021 struct sk_buff *skb;
1024 /* First, grab a route. */
1025 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1028 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1031 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1033 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1034 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1035 (inet_sk(sk)->tos & INET_ECN_MASK) :
1038 if (!INET_ECN_is_capable(tos) &&
1039 tcp_bpf_ca_needs_ecn((struct sock *)req))
1040 tos |= INET_ECN_ECT_0;
1043 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1045 rcu_dereference(ireq->ireq_opt),
1048 err = net_xmit_eval(err);
1055 * IPv4 request_sock destructor.
1057 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1059 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1062 #ifdef CONFIG_TCP_MD5SIG
1064 * RFC2385 MD5 checksumming requires a mapping of
1065 * IP address->MD5 Key.
1066 * We need to maintain these in the sk structure.
1069 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1070 EXPORT_SYMBOL(tcp_md5_needed);
1072 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1077 /* l3index always overrides non-l3index */
1078 if (old->l3index && new->l3index == 0)
1080 if (old->l3index == 0 && new->l3index)
1083 return old->prefixlen < new->prefixlen;
1086 /* Find the Key structure for an address. */
1087 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1088 const union tcp_md5_addr *addr,
1091 const struct tcp_sock *tp = tcp_sk(sk);
1092 struct tcp_md5sig_key *key;
1093 const struct tcp_md5sig_info *md5sig;
1095 struct tcp_md5sig_key *best_match = NULL;
1098 /* caller either holds rcu_read_lock() or socket lock */
1099 md5sig = rcu_dereference_check(tp->md5sig_info,
1100 lockdep_sock_is_held(sk));
1104 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1105 lockdep_sock_is_held(sk)) {
1106 if (key->family != family)
1108 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1110 if (family == AF_INET) {
1111 mask = inet_make_mask(key->prefixlen);
1112 match = (key->addr.a4.s_addr & mask) ==
1113 (addr->a4.s_addr & mask);
1114 #if IS_ENABLED(CONFIG_IPV6)
1115 } else if (family == AF_INET6) {
1116 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1123 if (match && better_md5_match(best_match, key))
1128 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1130 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1131 const union tcp_md5_addr *addr,
1132 int family, u8 prefixlen,
1133 int l3index, u8 flags)
1135 const struct tcp_sock *tp = tcp_sk(sk);
1136 struct tcp_md5sig_key *key;
1137 unsigned int size = sizeof(struct in_addr);
1138 const struct tcp_md5sig_info *md5sig;
1140 /* caller either holds rcu_read_lock() or socket lock */
1141 md5sig = rcu_dereference_check(tp->md5sig_info,
1142 lockdep_sock_is_held(sk));
1145 #if IS_ENABLED(CONFIG_IPV6)
1146 if (family == AF_INET6)
1147 size = sizeof(struct in6_addr);
1149 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1150 lockdep_sock_is_held(sk)) {
1151 if (key->family != family)
1153 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1155 if (key->l3index != l3index)
1157 if (!memcmp(&key->addr, addr, size) &&
1158 key->prefixlen == prefixlen)
1164 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1165 const struct sock *addr_sk)
1167 const union tcp_md5_addr *addr;
1170 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1171 addr_sk->sk_bound_dev_if);
1172 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1173 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1175 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1177 /* This can be called on a newly created socket, from other files */
1178 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1179 int family, u8 prefixlen, int l3index, u8 flags,
1180 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1182 /* Add Key to the list */
1183 struct tcp_md5sig_key *key;
1184 struct tcp_sock *tp = tcp_sk(sk);
1185 struct tcp_md5sig_info *md5sig;
1187 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1189 /* Pre-existing entry - just update that one.
1190 * Note that the key might be used concurrently.
		 * data_race() is telling KCSAN that we do not care about
		 * key mismatches, since changing the MD5 key on live flows
		 * can lead to packet drops.
		 */
1195 data_race(memcpy(key->key, newkey, newkeylen));
1197 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch the new key->keylen value
		 * but the old key->key[]; this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
1202 WRITE_ONCE(key->keylen, newkeylen);
1207 md5sig = rcu_dereference_protected(tp->md5sig_info,
1208 lockdep_sock_is_held(sk));
1210 md5sig = kmalloc(sizeof(*md5sig), gfp);
1215 INIT_HLIST_HEAD(&md5sig->head);
1216 rcu_assign_pointer(tp->md5sig_info, md5sig);
1219 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1222 if (!tcp_alloc_md5sig_pool()) {
1223 sock_kfree_s(sk, key, sizeof(*key));
1227 memcpy(key->key, newkey, newkeylen);
1228 key->keylen = newkeylen;
1229 key->family = family;
1230 key->prefixlen = prefixlen;
1231 key->l3index = l3index;
1233 memcpy(&key->addr, addr,
1234 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1235 sizeof(struct in_addr));
1236 hlist_add_head_rcu(&key->node, &md5sig->head);
1239 EXPORT_SYMBOL(tcp_md5_do_add);
1241 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1242 u8 prefixlen, int l3index, u8 flags)
1244 struct tcp_md5sig_key *key;
1246 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1249 hlist_del_rcu(&key->node);
1250 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1251 kfree_rcu(key, rcu);
1254 EXPORT_SYMBOL(tcp_md5_do_del);
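/* Release every configured MD5 key; called from tcp_v4_destroy_sock() once
 * no other reference to the key list can exist.
 */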
1256 static void tcp_clear_md5_list(struct sock *sk)
1258 struct tcp_sock *tp = tcp_sk(sk);
1259 struct tcp_md5sig_key *key;
1260 struct hlist_node *n;
1261 struct tcp_md5sig_info *md5sig;
1263 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1265 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1266 hlist_del_rcu(&key->node);
1267 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1268 kfree_rcu(key, rcu);
1272 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1273 sockptr_t optval, int optlen)
1275 struct tcp_md5sig cmd;
1276 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1277 const union tcp_md5_addr *addr;
1282 if (optlen < sizeof(cmd))
1285 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1288 if (sin->sin_family != AF_INET)
1291 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1293 if (optname == TCP_MD5SIG_EXT &&
1294 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1295 prefixlen = cmd.tcpm_prefixlen;
1300 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1301 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1302 struct net_device *dev;
1305 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1306 if (dev && netif_is_l3_master(dev))
1307 l3index = dev->ifindex;
1311 /* ok to reference set/not set outside of rcu;
1312 * right now device MUST be an L3 master
1314 if (!dev || !l3index)
1318 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1320 if (!cmd.tcpm_keylen)
1321 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1323 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1326 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1327 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
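/* Illustrative userspace sketch (not part of this file): configuring a key
 * that the parser above accepts. "fd", the peer address and the secret are
 * placeholders.
 *
 *	struct tcp_md5sig md5 = { 0 };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	md5.tcpm_keylen = 6;
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */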
1330 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1331 __be32 daddr, __be32 saddr,
1332 const struct tcphdr *th, int nbytes)
1334 struct tcp4_pseudohdr *bp;
1335 struct scatterlist sg;
1342 bp->protocol = IPPROTO_TCP;
1343 bp->len = cpu_to_be16(nbytes);
1345 _th = (struct tcphdr *)(bp + 1);
1346 memcpy(_th, th, sizeof(*th));
1349 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1350 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1351 sizeof(*bp) + sizeof(*th));
1352 return crypto_ahash_update(hp->md5_req);
1355 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1356 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1358 struct tcp_md5sig_pool *hp;
1359 struct ahash_request *req;
1361 hp = tcp_get_md5sig_pool();
1363 goto clear_hash_noput;
1366 if (crypto_ahash_init(req))
1368 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1370 if (tcp_md5_hash_key(hp, key))
1372 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1373 if (crypto_ahash_final(req))
1376 tcp_put_md5sig_pool();
1380 tcp_put_md5sig_pool();
1382 memset(md5_hash, 0, 16);
1386 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1387 const struct sock *sk,
1388 const struct sk_buff *skb)
1390 struct tcp_md5sig_pool *hp;
1391 struct ahash_request *req;
1392 const struct tcphdr *th = tcp_hdr(skb);
1393 __be32 saddr, daddr;
1395 if (sk) { /* valid for establish/request sockets */
1396 saddr = sk->sk_rcv_saddr;
1397 daddr = sk->sk_daddr;
1399 const struct iphdr *iph = ip_hdr(skb);
1404 hp = tcp_get_md5sig_pool();
1406 goto clear_hash_noput;
1409 if (crypto_ahash_init(req))
1412 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1414 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1416 if (tcp_md5_hash_key(hp, key))
1418 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1419 if (crypto_ahash_final(req))
1422 tcp_put_md5sig_pool();
1426 tcp_put_md5sig_pool();
1428 memset(md5_hash, 0, 16);
1431 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
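/* Fill the IPv4 side of a freshly allocated request sock from the incoming
 * SYN: our local address is the packet's daddr, the peer is its saddr, and
 * any IP options are saved for the eventual SYN-ACK.
 */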
1435 static void tcp_v4_init_req(struct request_sock *req,
1436 const struct sock *sk_listener,
1437 struct sk_buff *skb)
1439 struct inet_request_sock *ireq = inet_rsk(req);
1440 struct net *net = sock_net(sk_listener);
1442 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1443 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1444 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1447 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1448 struct sk_buff *skb,
1450 struct request_sock *req)
1452 tcp_v4_init_req(req, sk, skb);
1454 if (security_inet_conn_request(sk, skb, req))
1457 return inet_csk_route_req(sk, &fl->u.ip4, req);
1460 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1462 .obj_size = sizeof(struct tcp_request_sock),
1463 .rtx_syn_ack = tcp_rtx_synack,
1464 .send_ack = tcp_v4_reqsk_send_ack,
1465 .destructor = tcp_v4_reqsk_destructor,
1466 .send_reset = tcp_v4_send_reset,
1467 .syn_ack_timeout = tcp_syn_ack_timeout,
1470 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1471 .mss_clamp = TCP_MSS_DEFAULT,
1472 #ifdef CONFIG_TCP_MD5SIG
1473 .req_md5_lookup = tcp_v4_md5_lookup,
1474 .calc_md5_hash = tcp_v4_md5_hash_skb,
1476 #ifdef CONFIG_SYN_COOKIES
1477 .cookie_init_seq = cookie_v4_init_sequence,
1479 .route_req = tcp_v4_route_req,
1480 .init_seq = tcp_v4_init_seq,
1481 .init_ts_off = tcp_v4_init_ts_off,
1482 .send_synack = tcp_v4_send_synack,
1485 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
	/* Never answer SYNs sent to broadcast or multicast */
1488 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1491 return tcp_conn_request(&tcp_request_sock_ops,
1492 &tcp_request_sock_ipv4_ops, sk, skb);
1498 EXPORT_SYMBOL(tcp_v4_conn_request);
1502 * The three way handshake has completed - we got a valid synack -
1503 * now create the new socket.
1505 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1506 struct request_sock *req,
1507 struct dst_entry *dst,
1508 struct request_sock *req_unhash,
1511 struct inet_request_sock *ireq;
1512 bool found_dup_sk = false;
1513 struct inet_sock *newinet;
1514 struct tcp_sock *newtp;
1516 #ifdef CONFIG_TCP_MD5SIG
1517 const union tcp_md5_addr *addr;
1518 struct tcp_md5sig_key *key;
1521 struct ip_options_rcu *inet_opt;
1523 if (sk_acceptq_is_full(sk))
1526 newsk = tcp_create_openreq_child(sk, req, skb);
1530 newsk->sk_gso_type = SKB_GSO_TCPV4;
1531 inet_sk_rx_dst_set(newsk, skb);
1533 newtp = tcp_sk(newsk);
1534 newinet = inet_sk(newsk);
1535 ireq = inet_rsk(req);
1536 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1537 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1538 newsk->sk_bound_dev_if = ireq->ir_iif;
1539 newinet->inet_saddr = ireq->ir_loc_addr;
1540 inet_opt = rcu_dereference(ireq->ireq_opt);
1541 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1542 newinet->mc_index = inet_iif(skb);
1543 newinet->mc_ttl = ip_hdr(skb)->ttl;
1544 newinet->rcv_tos = ip_hdr(skb)->tos;
1545 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1547 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1548 newinet->inet_id = get_random_u16();
1550 /* Set ToS of the new socket based upon the value of incoming SYN.
1551 * ECT bits are set later in tcp_init_transfer().
1553 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1554 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1557 dst = inet_csk_route_child_sock(sk, newsk, req);
1561 /* syncookie case : see end of cookie_v4_check() */
1563 sk_setup_caps(newsk, dst);
1565 tcp_ca_openreq_child(newsk, dst);
1567 tcp_sync_mss(newsk, dst_mtu(dst));
1568 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1570 tcp_initialize_rcv_mss(newsk);
1572 #ifdef CONFIG_TCP_MD5SIG
1573 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1574 /* Copy over the MD5 key from the original socket */
1575 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1576 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1579 * We're using one, so create a matching key
1580 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key across.
1584 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1585 key->key, key->keylen, GFP_ATOMIC);
1586 sk_gso_disable(newsk);
1590 if (__inet_inherit_port(sk, newsk) < 0)
1592 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1594 if (likely(*own_req)) {
1595 tcp_move_syn(newtp, req);
1596 ireq->ireq_opt = NULL;
1598 newinet->inet_opt = NULL;
1600 if (!req_unhash && found_dup_sk) {
		/* This code path should only be executed in the
		 * syncookie case
		 */
1604 bh_unlock_sock(newsk);
1612 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1619 newinet->inet_opt = NULL;
1620 inet_csk_prepare_forced_close(newsk);
1624 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
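/* If the segment is not a SYN, it may carry a SYN cookie issued while the
 * listener's queue was full; cookie_v4_check() validates it and
 * reconstructs the connection state that was never stored.
 */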
1626 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1628 #ifdef CONFIG_SYN_COOKIES
1629 const struct tcphdr *th = tcp_hdr(skb);
1632 sk = cookie_v4_check(sk, skb);
1637 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1638 struct tcphdr *th, u32 *cookie)
1641 #ifdef CONFIG_SYN_COOKIES
1642 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1643 &tcp_request_sock_ipv4_ops, sk, th);
1645 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1646 tcp_synq_overflow(sk);
1652 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock held.
 */
1662 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1664 enum skb_drop_reason reason;
1667 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1668 struct dst_entry *dst;
1670 dst = rcu_dereference_protected(sk->sk_rx_dst,
1671 lockdep_sock_is_held(sk));
1673 sock_rps_save_rxhash(sk, skb);
1674 sk_mark_napi_id(sk, skb);
1676 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1677 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1679 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1683 tcp_rcv_established(sk, skb);
1687 reason = SKB_DROP_REASON_NOT_SPECIFIED;
1688 if (tcp_checksum_complete(skb))
1691 if (sk->sk_state == TCP_LISTEN) {
1692 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1697 if (tcp_child_process(sk, nsk, skb)) {
1704 sock_rps_save_rxhash(sk, skb);
1706 if (tcp_rcv_state_process(sk, skb)) {
1713 tcp_v4_send_reset(rsk, skb);
1715 kfree_skb_reason(skb, reason);
1716 /* Be careful here. If this function gets more complicated and
1717 * gcc suffers from register pressure on the x86, sk (in %ebx)
1718 * might be destroyed here. This current version compiles correctly,
1719 * but you have been warned.
1724 reason = SKB_DROP_REASON_TCP_CSUM;
1725 trace_tcp_bad_csum(skb);
1726 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1727 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1730 EXPORT_SYMBOL(tcp_v4_do_rcv);
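/* Early demux: before the routing decision, try to match the packet to an
 * established socket so its cached rx dst can be reused and the main socket
 * lookup in tcp_v4_rcv() can be skipped.
 */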
1732 int tcp_v4_early_demux(struct sk_buff *skb)
1734 struct net *net = dev_net(skb->dev);
1735 const struct iphdr *iph;
1736 const struct tcphdr *th;
1739 if (skb->pkt_type != PACKET_HOST)
1742 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1748 if (th->doff < sizeof(struct tcphdr) / 4)
1751 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1752 iph->saddr, th->source,
1753 iph->daddr, ntohs(th->dest),
1754 skb->skb_iif, inet_sdif(skb));
1757 skb->destructor = sock_edemux;
1758 if (sk_fullsock(sk)) {
1759 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1762 dst = dst_check(dst, 0);
1764 sk->sk_rx_dst_ifindex == skb->skb_iif)
1765 skb_dst_set_noref(skb, dst);
1771 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1772 enum skb_drop_reason *reason)
1774 u32 limit, tail_gso_size, tail_gso_segs;
1775 struct skb_shared_info *shinfo;
1776 const struct tcphdr *th;
1777 struct tcphdr *thtail;
1778 struct sk_buff *tail;
1779 unsigned int hdrlen;
1785 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1786 * we can fix skb->truesize to its real value to avoid future drops.
1787 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed that pure SACK packets were sometimes dropped
	 * (if cooked by drivers without the copybreak feature).
	 */
1795 if (unlikely(tcp_checksum_complete(skb))) {
1797 trace_tcp_bad_csum(skb);
1798 *reason = SKB_DROP_REASON_TCP_CSUM;
1799 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1800 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	/* Attempt coalescing to last skb in backlog, even if we are above the limits.
1806 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1808 th = (const struct tcphdr *)skb->data;
1809 hdrlen = th->doff * 4;
1811 tail = sk->sk_backlog.tail;
1814 thtail = (struct tcphdr *)tail->data;
1816 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1817 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1818 ((TCP_SKB_CB(tail)->tcp_flags |
1819 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1820 !((TCP_SKB_CB(tail)->tcp_flags &
1821 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1822 ((TCP_SKB_CB(tail)->tcp_flags ^
1823 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1824 #ifdef CONFIG_TLS_DEVICE
1825 tail->decrypted != skb->decrypted ||
1827 thtail->doff != th->doff ||
1828 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1831 __skb_pull(skb, hdrlen);
1833 shinfo = skb_shinfo(skb);
1834 gso_size = shinfo->gso_size ?: skb->len;
1835 gso_segs = shinfo->gso_segs ?: 1;
1837 shinfo = skb_shinfo(tail);
1838 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1839 tail_gso_segs = shinfo->gso_segs ?: 1;
1841 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1842 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1844 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1845 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1846 thtail->window = th->window;
1849 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1850 * thtail->fin, so that the fast path in tcp_rcv_established()
1851 * is not entered if we append a packet with a FIN.
1852 * SYN, RST, URG are not present.
1853 * ACK is set on both packets.
1854 * PSH : we do not really care in TCP stack,
1855 * at least for 'GRO' packets.
1857 thtail->fin |= th->fin;
1858 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1860 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1861 TCP_SKB_CB(tail)->has_rxtstamp = true;
1862 tail->tstamp = skb->tstamp;
1863 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1866 /* Not as strict as GRO. We only need to carry mss max value */
1867 shinfo->gso_size = max(gso_size, tail_gso_size);
1868 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1870 sk->sk_backlog.len += delta;
1871 __NET_INC_STATS(sock_net(sk),
1872 LINUX_MIB_TCPBACKLOGCOALESCE);
1873 kfree_skb_partial(skb, fragstolen);
1876 __skb_push(skb, hdrlen);
1879 limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
	/* Only the socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few socket backlogs are likely to be concurrently non-empty.
	 */
1887 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1889 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1890 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1895 EXPORT_SYMBOL(tcp_add_backlog);
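/* Run the socket filter (if any) on the segment; the filter may trim the
 * skb, but never below the TCP header itself.
 */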
1897 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1899 struct tcphdr *th = (struct tcphdr *)skb->data;
1901 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1903 EXPORT_SYMBOL(tcp_filter);
1905 static void tcp_v4_restore_cb(struct sk_buff *skb)
1907 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1908 sizeof(struct inet_skb_parm));
1911 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1912 const struct tcphdr *th)
	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
	 * barrier() makes sure the compiler won't play fool^Waliasing games.
	 */
1917 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1918 sizeof(struct inet_skb_parm));
1921 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1922 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1923 skb->len - th->doff * 4);
1924 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1925 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1926 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1927 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1928 TCP_SKB_CB(skb)->sacked = 0;
1929 TCP_SKB_CB(skb)->has_rxtstamp =
1930 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1937 int tcp_v4_rcv(struct sk_buff *skb)
1939 struct net *net = dev_net(skb->dev);
1940 enum skb_drop_reason drop_reason;
1941 int sdif = inet_sdif(skb);
1942 int dif = inet_iif(skb);
1943 const struct iphdr *iph;
1944 const struct tcphdr *th;
1949 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1950 if (skb->pkt_type != PACKET_HOST)
1953 /* Count it even if it's bad */
1954 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1956 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1959 th = (const struct tcphdr *)skb->data;
1961 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1962 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1965 if (!pskb_may_pull(skb, th->doff * 4))
	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided the case of th->doff == 0 is eliminated.
	 * So, we defer the checks. */
1973 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1976 th = (const struct tcphdr *)skb->data;
1979 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
1980 skb, __tcp_hdrlen(th), th->source,
1981 th->dest, sdif, &refcounted);
1986 if (sk->sk_state == TCP_TIME_WAIT)
1989 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1990 struct request_sock *req = inet_reqsk(sk);
1991 bool req_stolen = false;
1994 sk = req->rsk_listener;
1995 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1996 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1998 drop_reason = tcp_inbound_md5_hash(sk, skb,
1999 &iph->saddr, &iph->daddr,
2000 AF_INET, dif, sdif);
2001 if (unlikely(drop_reason)) {
2002 sk_drops_add(sk, skb);
2006 if (tcp_checksum_complete(skb)) {
2010 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2011 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2013 inet_csk_reqsk_queue_drop_and_put(sk, req);
			/* reuseport_migrate_sock() has already held one sk_refcnt before returning. */
2021 /* We own a reference on the listener, increase it again
2022 * as we might lose it too soon.
2028 if (!tcp_filter(sk, skb)) {
2029 th = (const struct tcphdr *)skb->data;
2031 tcp_v4_fill_cb(skb, iph, th);
2032 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2034 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2039 /* Another cpu got exclusive access to req
2040 * and created a full blown socket.
2041 * Try to feed this packet to this socket
2042 * instead of discarding it.
2044 tcp_v4_restore_cb(skb);
2048 goto discard_and_relse;
2053 tcp_v4_restore_cb(skb);
2054 } else if (tcp_child_process(sk, nsk, skb)) {
2055 tcp_v4_send_reset(nsk, skb);
2056 goto discard_and_relse;
2063 if (static_branch_unlikely(&ip4_min_ttl)) {
2064 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2065 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2066 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2067 goto discard_and_relse;
2071 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2072 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2073 goto discard_and_relse;
2076 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2077 &iph->daddr, AF_INET, dif, sdif);
2079 goto discard_and_relse;
2083 if (tcp_filter(sk, skb)) {
2084 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2085 goto discard_and_relse;
2087 th = (const struct tcphdr *)skb->data;
2089 tcp_v4_fill_cb(skb, iph, th);
2093 if (sk->sk_state == TCP_LISTEN) {
2094 ret = tcp_v4_do_rcv(sk, skb);
2095 goto put_and_return;
2098 sk_incoming_cpu_update(sk);
2100 bh_lock_sock_nested(sk);
2101 tcp_segs_in(tcp_sk(sk), skb);
2103 if (!sock_owned_by_user(sk)) {
2104 ret = tcp_v4_do_rcv(sk, skb);
2106 if (tcp_add_backlog(sk, skb, &drop_reason))
2107 goto discard_and_relse;
2118 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2119 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2122 tcp_v4_fill_cb(skb, iph, th);
2124 if (tcp_checksum_complete(skb)) {
2126 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2127 trace_tcp_bad_csum(skb);
2128 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2130 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2132 tcp_v4_send_reset(NULL, skb);
2136 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2137 /* Discard frame. */
2138 kfree_skb_reason(skb, drop_reason);
2142 sk_drops_add(sk, skb);
2148 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2149 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2150 inet_twsk_put(inet_twsk(sk));
2154 tcp_v4_fill_cb(skb, iph, th);
2156 if (tcp_checksum_complete(skb)) {
2157 inet_twsk_put(inet_twsk(sk));
2160 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2162 struct sock *sk2 = inet_lookup_listener(net,
2163 net->ipv4.tcp_death_row.hashinfo,
2164 skb, __tcp_hdrlen(th),
2165 iph->saddr, th->source,
2166 iph->daddr, th->dest,
2170 inet_twsk_deschedule_put(inet_twsk(sk));
2172 tcp_v4_restore_cb(skb);
2180 tcp_v4_timewait_ack(sk, skb);
2183 tcp_v4_send_reset(sk, skb);
2184 inet_twsk_deschedule_put(inet_twsk(sk));
2186 case TCP_TW_SUCCESS:;
2191 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2192 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2193 .twsk_unique = tcp_twsk_unique,
	.twsk_destructor = tcp_twsk_destructor,
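/* Cache the route the packet arrived on so the established/early-demux
 * fast path can reuse it without another route lookup.
 */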
2197 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2199 struct dst_entry *dst = skb_dst(skb);
2201 if (dst && dst_hold_safe(dst)) {
2202 rcu_assign_pointer(sk->sk_rx_dst, dst);
2203 sk->sk_rx_dst_ifindex = skb->skb_iif;
2206 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2208 const struct inet_connection_sock_af_ops ipv4_specific = {
2209 .queue_xmit = ip_queue_xmit,
2210 .send_check = tcp_v4_send_check,
2211 .rebuild_header = inet_sk_rebuild_header,
2212 .sk_rx_dst_set = inet_sk_rx_dst_set,
2213 .conn_request = tcp_v4_conn_request,
2214 .syn_recv_sock = tcp_v4_syn_recv_sock,
2215 .net_header_len = sizeof(struct iphdr),
2216 .setsockopt = ip_setsockopt,
2217 .getsockopt = ip_getsockopt,
2218 .addr2sockaddr = inet_csk_addr2sockaddr,
2219 .sockaddr_len = sizeof(struct sockaddr_in),
2220 .mtu_reduced = tcp_v4_mtu_reduced,
2222 EXPORT_SYMBOL(ipv4_specific);
2224 #ifdef CONFIG_TCP_MD5SIG
2225 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2226 .md5_lookup = tcp_v4_md5_lookup,
2227 .calc_md5_hash = tcp_v4_md5_hash_skb,
2228 .md5_parse = tcp_v4_parse_md5_keys,
/* NOTE: A lot of things are set to zero explicitly by the call to
 * sk_alloc(), so they need not be done here.
 */
2235 static int tcp_v4_init_sock(struct sock *sk)
2237 struct inet_connection_sock *icsk = inet_csk(sk);
2241 icsk->icsk_af_ops = &ipv4_specific;
2243 #ifdef CONFIG_TCP_MD5SIG
2244 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2250 void tcp_v4_destroy_sock(struct sock *sk)
2252 struct tcp_sock *tp = tcp_sk(sk);
2254 trace_tcp_destroy_sock(sk);
2256 tcp_clear_xmit_timers(sk);
2258 tcp_cleanup_congestion_control(sk);
2260 tcp_cleanup_ulp(sk);
	/* Clean up the write buffer. */
2263 tcp_write_queue_purge(sk);
2265 /* Check if we want to disable active TFO */
2266 tcp_fastopen_active_disable_ofo_check(sk);
2268 /* Cleans up our, hopefully empty, out_of_order_queue. */
2269 skb_rbtree_purge(&tp->out_of_order_queue);
2271 #ifdef CONFIG_TCP_MD5SIG
2272 /* Clean up the MD5 key list, if any */
2273 if (tp->md5sig_info) {
2274 tcp_clear_md5_list(sk);
2275 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2276 tp->md5sig_info = NULL;
2280 /* Clean up a referenced TCP bind bucket. */
2281 if (inet_csk(sk)->icsk_bind_hash)
2284 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2286 /* If socket is aborted during connect operation */
2287 tcp_free_fastopen_req(tp);
2288 tcp_fastopen_destroy_cipher(sk);
2289 tcp_saved_syn_free(tp);
2291 sk_sockets_allocated_dec(sk);
2293 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2295 #ifdef CONFIG_PROC_FS
2296 /* Proc filesystem TCP sock list dumping. */
2298 static unsigned short seq_file_family(const struct seq_file *seq);
2300 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2302 unsigned short family = seq_file_family(seq);
2304 /* AF_UNSPEC is used as a match all */
2305 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2306 net_eq(sock_net(sk), seq_file_net(seq)));
2309 /* Find a non empty bucket (starting from st->bucket)
2310 * and return the first sk from it.
2312 static void *listening_get_first(struct seq_file *seq)
2314 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2315 struct tcp_iter_state *st = seq->private;
2318 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2319 struct inet_listen_hashbucket *ilb2;
2320 struct hlist_nulls_node *node;
2323 ilb2 = &hinfo->lhash2[st->bucket];
2324 if (hlist_nulls_empty(&ilb2->nulls_head))
2327 spin_lock(&ilb2->lock);
2328 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2329 if (seq_sk_match(seq, sk))
2332 spin_unlock(&ilb2->lock);
2338 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2339 * If "cur" is the last one in the st->bucket,
2340 * call listening_get_first() to return the first sk of the next non-empty bucket.
2343 static void *listening_get_next(struct seq_file *seq, void *cur)
2345 struct tcp_iter_state *st = seq->private;
2346 struct inet_listen_hashbucket *ilb2;
2347 struct hlist_nulls_node *node;
2348 struct inet_hashinfo *hinfo;
2349 struct sock *sk = cur;
2354 sk = sk_nulls_next(sk);
2355 sk_nulls_for_each_from(sk, node) {
2356 if (seq_sk_match(seq, sk))
2360 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2361 ilb2 = &hinfo->lhash2[st->bucket];
2362 spin_unlock(&ilb2->lock);
2364 return listening_get_first(seq);
2367 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2369 struct tcp_iter_state *st = seq->private;
2374 rc = listening_get_first(seq);
2376 while (rc && *pos) {
2377 rc = listening_get_next(seq, rc);
2383 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2384 const struct tcp_iter_state *st)
2386 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2390 * Get first established socket starting from bucket given in st->bucket.
2391 * If st->bucket is zero, the very first socket in the hash is returned.
2393 static void *established_get_first(struct seq_file *seq)
2395 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2396 struct tcp_iter_state *st = seq->private;
2399 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2401 struct hlist_nulls_node *node;
2402 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2404 /* Lockless fast path for the common case of empty buckets */
2405 if (empty_bucket(hinfo, st))
2409 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2410 if (seq_sk_match(seq, sk))
2413 spin_unlock_bh(lock);
2419 static void *established_get_next(struct seq_file *seq, void *cur)
2421 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2422 struct tcp_iter_state *st = seq->private;
2423 struct hlist_nulls_node *node;
2424 struct sock *sk = cur;
2429 sk = sk_nulls_next(sk);
2431 sk_nulls_for_each_from(sk, node) {
2432 if (seq_sk_match(seq, sk))
2436 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2438 return established_get_first(seq);
2441 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2443 struct tcp_iter_state *st = seq->private;
2447 rc = established_get_first(seq);
2450 rc = established_get_next(seq, rc);
2456 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2459 struct tcp_iter_state *st = seq->private;
2461 st->state = TCP_SEQ_STATE_LISTENING;
2462 rc = listening_get_idx(seq, &pos);
2465 st->state = TCP_SEQ_STATE_ESTABLISHED;
2466 rc = established_get_idx(seq, pos);
2472 static void *tcp_seek_last_pos(struct seq_file *seq)
2474 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2475 struct tcp_iter_state *st = seq->private;
2476 int bucket = st->bucket;
2477 int offset = st->offset;
2478 int orig_num = st->num;
2481 switch (st->state) {
2482 case TCP_SEQ_STATE_LISTENING:
2483 if (st->bucket > hinfo->lhash2_mask)
2485 st->state = TCP_SEQ_STATE_LISTENING;
2486 rc = listening_get_first(seq);
2487 while (offset-- && rc && bucket == st->bucket)
2488 rc = listening_get_next(seq, rc);
2492 st->state = TCP_SEQ_STATE_ESTABLISHED;
2494 case TCP_SEQ_STATE_ESTABLISHED:
2495 if (st->bucket > hinfo->ehash_mask)
2497 rc = established_get_first(seq);
2498 while (offset-- && rc && bucket == st->bucket)
2499 rc = established_get_next(seq, rc);
2507 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2509 struct tcp_iter_state *st = seq->private;
2512 if (*pos && *pos == st->last_pos) {
2513 rc = tcp_seek_last_pos(seq);
2518 st->state = TCP_SEQ_STATE_LISTENING;
2522 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2525 st->last_pos = *pos;
2528 EXPORT_SYMBOL(tcp_seq_start);
2530 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2532 struct tcp_iter_state *st = seq->private;
2535 if (v == SEQ_START_TOKEN) {
2536 rc = tcp_get_idx(seq, 0);
2540 switch (st->state) {
2541 case TCP_SEQ_STATE_LISTENING:
2542 rc = listening_get_next(seq, v);
2544 st->state = TCP_SEQ_STATE_ESTABLISHED;
2547 rc = established_get_first(seq);
2550 case TCP_SEQ_STATE_ESTABLISHED:
2551 rc = established_get_next(seq, v);
2556 st->last_pos = *pos;
2559 EXPORT_SYMBOL(tcp_seq_next);
2561 void tcp_seq_stop(struct seq_file *seq, void *v)
2563 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2564 struct tcp_iter_state *st = seq->private;
2566 switch (st->state) {
2567 case TCP_SEQ_STATE_LISTENING:
2568 if (v != SEQ_START_TOKEN)
2569 spin_unlock(&hinfo->lhash2[st->bucket].lock);
2571 case TCP_SEQ_STATE_ESTABLISHED:
2573 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2577 EXPORT_SYMBOL(tcp_seq_stop);
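/*
 * Editorial note: tcp_seq_start(), tcp_seq_next() and tcp_seq_stop() above
 * implement the generic seq_file protocol for /proc/net/tcp.  Roughly, the
 * seq_file core drives them like this (an illustrative pseudo-code sketch,
 * not the real fs/seq_file.c loop):
 *
 *	p = start(seq, &pos);
 *	while (p) {
 *		show(seq, p);			(tcp4_seq_show() below)
 *		p = next(seq, p, &pos);
 *	}
 *	stop(seq, p);
 *
 * start()/next() return with the current listening or established bucket
 * lock held so show() can safely dereference the socket; stop() drops it.
 */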
2579 static void get_openreq4(const struct request_sock *req,
2580 struct seq_file *f, int i)
2582 const struct inet_request_sock *ireq = inet_rsk(req);
2583 long delta = req->rsk_timer.expires - jiffies;
2585 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2586 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2591 ntohs(ireq->ir_rmt_port),
2593 0, 0, /* could print option size, but that is af dependent. */
2594 1, /* timers active (only the expire timer) */
2595 jiffies_delta_to_clock_t(delta),
2597 from_kuid_munged(seq_user_ns(f),
2598 sock_i_uid(req->rsk_listener)),
2599 0, /* non-standard timer */
2600 0, /* open_requests have no inode */
2605 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2608 unsigned long timer_expires;
2609 const struct tcp_sock *tp = tcp_sk(sk);
2610 const struct inet_connection_sock *icsk = inet_csk(sk);
2611 const struct inet_sock *inet = inet_sk(sk);
2612 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2613 __be32 dest = inet->inet_daddr;
2614 __be32 src = inet->inet_rcv_saddr;
2615 __u16 destp = ntohs(inet->inet_dport);
2616 __u16 srcp = ntohs(inet->inet_sport);
2620 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2621 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2622 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2624 timer_expires = icsk->icsk_timeout;
2625 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2627 timer_expires = icsk->icsk_timeout;
2628 } else if (timer_pending(&sk->sk_timer)) {
2630 timer_expires = sk->sk_timer.expires;
2633 timer_expires = jiffies;
2636 state = inet_sk_state_load(sk);
2637 if (state == TCP_LISTEN)
2638 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2640 /* Because we don't lock the socket,
2641 * we might find a transient negative value.
2643 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2644 READ_ONCE(tp->copied_seq), 0);
2646 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2647 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2648 i, src, srcp, dest, destp, state,
2649 READ_ONCE(tp->write_seq) - tp->snd_una,
2652 jiffies_delta_to_clock_t(timer_expires - jiffies),
2653 icsk->icsk_retransmits,
2654 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2655 icsk->icsk_probes_out,
2657 refcount_read(&sk->sk_refcnt), sk,
2658 jiffies_to_clock_t(icsk->icsk_rto),
2659 jiffies_to_clock_t(icsk->icsk_ack.ato),
2660 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2662 state == TCP_LISTEN ?
2663 fastopenq->max_qlen :
2664 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2667 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2668 struct seq_file *f, int i)
2670 long delta = tw->tw_timer.expires - jiffies;
2674 dest = tw->tw_daddr;
2675 src = tw->tw_rcv_saddr;
2676 destp = ntohs(tw->tw_dport);
2677 srcp = ntohs(tw->tw_sport);
2679 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2680 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2681 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2682 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2683 refcount_read(&tw->tw_refcnt), tw);
2688 static int tcp4_seq_show(struct seq_file *seq, void *v)
2690 struct tcp_iter_state *st;
2691 struct sock *sk = v;
2693 seq_setwidth(seq, TMPSZ - 1);
2694 if (v == SEQ_START_TOKEN) {
2695 seq_puts(seq, " sl local_address rem_address st tx_queue "
2696 "rx_queue tr tm->when retrnsmt uid timeout "
2702 if (sk->sk_state == TCP_TIME_WAIT)
2703 get_timewait4_sock(v, seq, st->num);
2704 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2705 get_openreq4(v, seq, st->num);
2707 get_tcp4_sock(v, seq, st->num);
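/*
 * Editorial note: a minimal userspace sketch (not part of this file)
 * showing how the columns printed above are consumed.  Addresses and
 * ports are emitted as raw hex via the %08X/%04X formats in
 * get_tcp4_sock(), so on little-endian hosts the address bytes appear
 * reversed relative to dotted-quad notation:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char line[512];
 *		FILE *f = fopen("/proc/net/tcp", "r");
 *
 *		if (!f)
 *			return 1;
 *		fgets(line, sizeof(line), f);		(skip the header line)
 *		while (fgets(line, sizeof(line), f)) {
 *			unsigned int addr, port, state;
 *
 *			if (sscanf(line, "%*d: %x:%x %*x:%*x %x",
 *				   &addr, &port, &state) == 3)
 *				printf("local %08X:%04X state %02X\n",
 *				       addr, port, state);
 *		}
 *		fclose(f);
 *		return 0;
 *	}
 */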
2713 #ifdef CONFIG_BPF_SYSCALL
2714 struct bpf_tcp_iter_state {
2715 struct tcp_iter_state state;
2716 unsigned int cur_sk;
2717 unsigned int end_sk;
2718 unsigned int max_sk;
2719 struct sock **batch;
2720 bool st_bucket_done;
2723 struct bpf_iter__tcp {
2724 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2725 __bpf_md_ptr(struct sock_common *, sk_common);
2726 uid_t uid __aligned(8);
2729 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2730 struct sock_common *sk_common, uid_t uid)
2732 struct bpf_iter__tcp ctx;
2734 meta->seq_num--; /* skip SEQ_START_TOKEN */
2736 ctx.sk_common = sk_common;
2738 return bpf_iter_run_prog(prog, &ctx);
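/*
 * Editorial note: tcp_prog_seq_show() above builds the bpf_iter__tcp
 * context and runs the attached program once per socket.  A minimal,
 * illustrative BPF-side sketch (built separately against vmlinux.h with
 * libbpf; dump_tcp is a hypothetical name, not something defined here):
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *		struct seq_file *seq = ctx->meta->seq;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(seq, "family=%d uid=%u\n",
 *			       skc->skc_family, ctx->uid);
 *		return 0;
 *	}
 */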
2741 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2743 while (iter->cur_sk < iter->end_sk)
2744 sock_put(iter->batch[iter->cur_sk++]);
2747 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2748 unsigned int new_batch_sz)
2750 struct sock **new_batch;
2752 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2753 GFP_USER | __GFP_NOWARN);
2757 bpf_iter_tcp_put_batch(iter);
2758 kvfree(iter->batch);
2759 iter->batch = new_batch;
2760 iter->max_sk = new_batch_sz;
2765 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2766 struct sock *start_sk)
2768 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2769 struct bpf_tcp_iter_state *iter = seq->private;
2770 struct tcp_iter_state *st = &iter->state;
2771 struct hlist_nulls_node *node;
2772 unsigned int expected = 1;
2775 sock_hold(start_sk);
2776 iter->batch[iter->end_sk++] = start_sk;
2778 sk = sk_nulls_next(start_sk);
2779 sk_nulls_for_each_from(sk, node) {
2780 if (seq_sk_match(seq, sk)) {
2781 if (iter->end_sk < iter->max_sk) {
2783 iter->batch[iter->end_sk++] = sk;
2788 spin_unlock(&hinfo->lhash2[st->bucket].lock);
2793 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2794 struct sock *start_sk)
2796 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2797 struct bpf_tcp_iter_state *iter = seq->private;
2798 struct tcp_iter_state *st = &iter->state;
2799 struct hlist_nulls_node *node;
2800 unsigned int expected = 1;
2803 sock_hold(start_sk);
2804 iter->batch[iter->end_sk++] = start_sk;
2806 sk = sk_nulls_next(start_sk);
2807 sk_nulls_for_each_from(sk, node) {
2808 if (seq_sk_match(seq, sk)) {
2809 if (iter->end_sk < iter->max_sk) {
2811 iter->batch[iter->end_sk++] = sk;
2816 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2821 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2823 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2824 struct bpf_tcp_iter_state *iter = seq->private;
2825 struct tcp_iter_state *st = &iter->state;
2826 unsigned int expected;
2827 bool resized = false;
2830 /* The st->bucket is done. Directly advance to the next
2831 * bucket instead of having tcp_seek_last_pos() skip the
2832 * sockets one by one in the current bucket, only to find out
2833 * it has to advance to the next bucket.
2835 if (iter->st_bucket_done) {
2838 if (st->state == TCP_SEQ_STATE_LISTENING &&
2839 st->bucket > hinfo->lhash2_mask) {
2840 st->state = TCP_SEQ_STATE_ESTABLISHED;
2846 /* Get a new batch */
2849 iter->st_bucket_done = false;
2851 sk = tcp_seek_last_pos(seq);
2853 return NULL; /* Done */
2855 if (st->state == TCP_SEQ_STATE_LISTENING)
2856 expected = bpf_iter_tcp_listening_batch(seq, sk);
2858 expected = bpf_iter_tcp_established_batch(seq, sk);
2860 if (iter->end_sk == expected) {
2861 iter->st_bucket_done = true;
2865 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2873 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2875 /* bpf iter does not support lseek, so it always
2876 * continues from where it was stop()-ped.
2879 return bpf_iter_tcp_batch(seq);
2881 return SEQ_START_TOKEN;
2884 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2886 struct bpf_tcp_iter_state *iter = seq->private;
2887 struct tcp_iter_state *st = &iter->state;
2890 /* Whenever seq_next() is called, the iter->cur_sk is
2891 * done with seq_show(), so advance to the next sk in the batch.
2894 if (iter->cur_sk < iter->end_sk) {
2895 /* Keeping st->num consistent in tcp_iter_state.
2896 * bpf_iter_tcp does not use st->num.
2897 * meta.seq_num is used instead.
2900 /* Move st->offset to the next sk in the bucket such that
2901 * the future start() will resume at st->offset in
2902 * st->bucket. See tcp_seek_last_pos().
2905 sock_put(iter->batch[iter->cur_sk++]);
2908 if (iter->cur_sk < iter->end_sk)
2909 sk = iter->batch[iter->cur_sk];
2911 sk = bpf_iter_tcp_batch(seq);
2914 /* Keeping st->last_pos consistent in tcp_iter_state.
2915 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2917 st->last_pos = *pos;
2921 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2923 struct bpf_iter_meta meta;
2924 struct bpf_prog *prog;
2925 struct sock *sk = v;
2930 if (v == SEQ_START_TOKEN)
2933 if (sk_fullsock(sk))
2934 slow = lock_sock_fast(sk);
2936 if (unlikely(sk_unhashed(sk))) {
2941 if (sk->sk_state == TCP_TIME_WAIT) {
2943 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2944 const struct request_sock *req = v;
2946 uid = from_kuid_munged(seq_user_ns(seq),
2947 sock_i_uid(req->rsk_listener));
2949 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2953 prog = bpf_iter_get_info(&meta, false);
2954 ret = tcp_prog_seq_show(prog, &meta, v, uid);
2957 if (sk_fullsock(sk))
2958 unlock_sock_fast(sk, slow);
2963 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2965 struct bpf_tcp_iter_state *iter = seq->private;
2966 struct bpf_iter_meta meta;
2967 struct bpf_prog *prog;
2971 prog = bpf_iter_get_info(&meta, true);
2973 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2976 if (iter->cur_sk < iter->end_sk) {
2977 bpf_iter_tcp_put_batch(iter);
2978 iter->st_bucket_done = false;
2982 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2983 .show = bpf_iter_tcp_seq_show,
2984 .start = bpf_iter_tcp_seq_start,
2985 .next = bpf_iter_tcp_seq_next,
2986 .stop = bpf_iter_tcp_seq_stop,
2989 static unsigned short seq_file_family(const struct seq_file *seq)
2991 const struct tcp_seq_afinfo *afinfo;
2993 #ifdef CONFIG_BPF_SYSCALL
2994 /* Iterated from bpf_iter. Let the bpf prog filter instead. */
2995 if (seq->op == &bpf_iter_tcp_seq_ops)
2999 /* Iterated from proc fs */
3000 afinfo = pde_data(file_inode(seq->file));
3001 return afinfo->family;
3004 static const struct seq_operations tcp4_seq_ops = {
3005 .show = tcp4_seq_show,
3006 .start = tcp_seq_start,
3007 .next = tcp_seq_next,
3008 .stop = tcp_seq_stop,
3011 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3015 static int __net_init tcp4_proc_init_net(struct net *net)
3017 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3018 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3023 static void __net_exit tcp4_proc_exit_net(struct net *net)
3025 remove_proc_entry("tcp", net->proc_net);
3028 static struct pernet_operations tcp4_net_ops = {
3029 .init = tcp4_proc_init_net,
3030 .exit = tcp4_proc_exit_net,
3033 int __init tcp4_proc_init(void)
3035 return register_pernet_subsys(&tcp4_net_ops);
3038 void tcp4_proc_exit(void)
3040 unregister_pernet_subsys(&tcp4_net_ops);
3042 #endif /* CONFIG_PROC_FS */
3044 /* @wake is one when sk_stream_write_space() calls us.
3045 * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3046 * This mimics the strategy used in sock_def_write_space().
3048 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3050 const struct tcp_sock *tp = tcp_sk(sk);
3051 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3052 READ_ONCE(tp->snd_nxt);
3054 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3056 EXPORT_SYMBOL(tcp_stream_memory_free);
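/*
 * Editorial note: with @wake == 1 the test above reduces to
 * "notsent_bytes < tcp_notsent_lowat(tp) / 2", i.e. blocked writers are
 * only woken once the unsent backlog has drained below half the limit.
 * Illustrative userspace sketch (not part of this file) configuring the
 * per-socket limit that feeds tcp_notsent_lowat():
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <sys/socket.h>
 *
 *	static int set_notsent_lowat(int fd)
 *	{
 *		int lowat = 128 * 1024;
 *
 *		return setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
 *				  &lowat, sizeof(lowat));
 *	}
 *
 * With the value above, EPOLLOUT wakeups from sk_stream_write_space()
 * resume once less than roughly 64 KB of written data remains unsent.
 */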
3058 struct proto tcp_prot = {
3060 .owner = THIS_MODULE,
3062 .pre_connect = tcp_v4_pre_connect,
3063 .connect = tcp_v4_connect,
3064 .disconnect = tcp_disconnect,
3065 .accept = inet_csk_accept,
3067 .init = tcp_v4_init_sock,
3068 .destroy = tcp_v4_destroy_sock,
3069 .shutdown = tcp_shutdown,
3070 .setsockopt = tcp_setsockopt,
3071 .getsockopt = tcp_getsockopt,
3072 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3073 .keepalive = tcp_set_keepalive,
3074 .recvmsg = tcp_recvmsg,
3075 .sendmsg = tcp_sendmsg,
3076 .sendpage = tcp_sendpage,
3077 .backlog_rcv = tcp_v4_do_rcv,
3078 .release_cb = tcp_release_cb,
3080 .unhash = inet_unhash,
3081 .get_port = inet_csk_get_port,
3082 .put_port = inet_put_port,
3083 #ifdef CONFIG_BPF_SYSCALL
3084 .psock_update_sk_prot = tcp_bpf_update_proto,
3086 .enter_memory_pressure = tcp_enter_memory_pressure,
3087 .leave_memory_pressure = tcp_leave_memory_pressure,
3088 .stream_memory_free = tcp_stream_memory_free,
3089 .sockets_allocated = &tcp_sockets_allocated,
3090 .orphan_count = &tcp_orphan_count,
3092 .memory_allocated = &tcp_memory_allocated,
3093 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
3095 .memory_pressure = &tcp_memory_pressure,
3096 .sysctl_mem = sysctl_tcp_mem,
3097 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3098 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3099 .max_header = MAX_TCP_HEADER,
3100 .obj_size = sizeof(struct tcp_sock),
3101 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3102 .twsk_prot = &tcp_timewait_sock_ops,
3103 .rsk_prot = &tcp_request_sock_ops,
3105 .no_autobind = true,
3106 .diag_destroy = tcp_abort,
3108 EXPORT_SYMBOL(tcp_prot);
3110 static void __net_exit tcp_sk_exit(struct net *net)
3112 if (net->ipv4.tcp_congestion_control)
3113 bpf_module_put(net->ipv4.tcp_congestion_control,
3114 net->ipv4.tcp_congestion_control->owner);
3117 static void __net_init tcp_set_hashinfo(struct net *net)
3119 struct inet_hashinfo *hinfo;
3120 unsigned int ehash_entries;
3121 struct net *old_net;
3123 if (net_eq(net, &init_net))
3126 old_net = current->nsproxy->net_ns;
3127 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3131 ehash_entries = roundup_pow_of_two(ehash_entries);
3132 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3134 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3135 "for a netns, fallback to the global one\n",
3138 hinfo = &tcp_hashinfo;
3139 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3142 net->ipv4.tcp_death_row.hashinfo = hinfo;
3143 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3144 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
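/*
 * Editorial note: a worked example of the sizing above, with illustrative
 * numbers.  If the creating netns has net.ipv4.tcp_child_ehash_entries
 * set to 1000, the child netns gets:
 *
 *	ehash_entries          = roundup_pow_of_two(1000) = 1024
 *	sysctl_max_tw_buckets  = 1024 / 2                  = 512
 *	sysctl_max_syn_backlog = max(128, 1024 / 128)      = 128
 *
 * With the sysctl left at its default of 0, the child netns keeps using
 * the global tcp_hashinfo instead of allocating its own ehash.
 */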
3147 static int __net_init tcp_sk_init(struct net *net)
3149 net->ipv4.sysctl_tcp_ecn = 2;
3150 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3152 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3153 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3154 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3155 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3156 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3158 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3159 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3160 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3162 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3163 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3164 net->ipv4.sysctl_tcp_syncookies = 1;
3165 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3166 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3167 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3168 net->ipv4.sysctl_tcp_orphan_retries = 0;
3169 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3170 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3171 net->ipv4.sysctl_tcp_tw_reuse = 2;
3172 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3174 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3175 tcp_set_hashinfo(net);
3177 net->ipv4.sysctl_tcp_sack = 1;
3178 net->ipv4.sysctl_tcp_window_scaling = 1;
3179 net->ipv4.sysctl_tcp_timestamps = 1;
3180 net->ipv4.sysctl_tcp_early_retrans = 3;
3181 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3182 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3183 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3184 net->ipv4.sysctl_tcp_max_reordering = 300;
3185 net->ipv4.sysctl_tcp_dsack = 1;
3186 net->ipv4.sysctl_tcp_app_win = 31;
3187 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3188 net->ipv4.sysctl_tcp_frto = 2;
3189 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3190 /* This limits the percentage of the congestion window which we
3191 * will allow a single TSO frame to consume. Building TSO frames
3192 * which are too large can cause TCP streams to be bursty.
3194 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3195 /* Default TSQ limit of 16 TSO segments */
3196 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3198 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3199 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3201 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3202 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
3203 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3204 net->ipv4.sysctl_tcp_autocorking = 1;
3205 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3206 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3207 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3208 if (net != &init_net) {
3209 memcpy(net->ipv4.sysctl_tcp_rmem,
3210 init_net.ipv4.sysctl_tcp_rmem,
3211 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3212 memcpy(net->ipv4.sysctl_tcp_wmem,
3213 init_net.ipv4.sysctl_tcp_wmem,
3214 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3216 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3217 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3218 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3219 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3220 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3221 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3223 /* Reno is always built in */
3224 if (!net_eq(net, &init_net) &&
3225 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3226 init_net.ipv4.tcp_congestion_control->owner))
3227 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3229 net->ipv4.tcp_congestion_control = &tcp_reno;
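/*
 * Editorial note: the per-netns defaults initialised above are exposed
 * under /proc/sys/net/ipv4/ in the corresponding namespace.  Illustrative
 * userspace sketch (not part of this file) reading one of them back:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char buf[16];
 *		FILE *f = fopen("/proc/sys/net/ipv4/tcp_syn_retries", "r");
 *
 *		if (f && fgets(buf, sizeof(buf), f))
 *			printf("tcp_syn_retries = %s", buf);
 *		if (f)
 *			fclose(f);
 *		return 0;
 *	}
 *
 * This prints 6 by default, matching TCP_SYN_RETRIES set above.
 */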
3234 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3238 tcp_twsk_purge(net_exit_list, AF_INET);
3240 list_for_each_entry(net, net_exit_list, exit_list) {
3241 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3242 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3243 tcp_fastopen_ctx_destroy(net);
3247 static struct pernet_operations __net_initdata tcp_sk_ops = {
3248 .init = tcp_sk_init,
3249 .exit = tcp_sk_exit,
3250 .exit_batch = tcp_sk_exit_batch,
3253 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3254 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3255 struct sock_common *sk_common, uid_t uid)
3257 #define INIT_BATCH_SZ 16
3259 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3261 struct bpf_tcp_iter_state *iter = priv_data;
3264 err = bpf_iter_init_seq_net(priv_data, aux);
3268 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3270 bpf_iter_fini_seq_net(priv_data);
3277 static void bpf_iter_fini_tcp(void *priv_data)
3279 struct bpf_tcp_iter_state *iter = priv_data;
3281 bpf_iter_fini_seq_net(priv_data);
3282 kvfree(iter->batch);
3285 static const struct bpf_iter_seq_info tcp_seq_info = {
3286 .seq_ops = &bpf_iter_tcp_seq_ops,
3287 .init_seq_private = bpf_iter_init_tcp,
3288 .fini_seq_private = bpf_iter_fini_tcp,
3289 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3292 static const struct bpf_func_proto *
3293 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3294 const struct bpf_prog *prog)
3297 case BPF_FUNC_setsockopt:
3298 return &bpf_sk_setsockopt_proto;
3299 case BPF_FUNC_getsockopt:
3300 return &bpf_sk_getsockopt_proto;
3306 static struct bpf_iter_reg tcp_reg_info = {
3308 .ctx_arg_info_size = 1,
3310 { offsetof(struct bpf_iter__tcp, sk_common),
3311 PTR_TO_BTF_ID_OR_NULL },
3313 .get_func_proto = bpf_iter_tcp_get_func_proto,
3314 .seq_info = &tcp_seq_info,
3317 static void __init bpf_iter_register(void)
3319 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3320 if (bpf_iter_reg_target(&tcp_reg_info))
3321 pr_warn("Warning: could not register bpf iterator tcp\n");
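/*
 * Editorial note: once bpf_iter_reg_target() above has registered the
 * "tcp" iterator target, userspace can attach a program to it and read
 * the output like a file.  Illustrative libbpf-based sketch (not part of
 * this file; skel->progs.dump_tcp refers to the hypothetical program from
 * the earlier sketch, error handling and includes omitted):
 *
 *	struct bpf_link *link;
 *	char buf[4096];
 *	int iter_fd, n;
 *
 *	link = bpf_program__attach_iter(skel->progs.dump_tcp, NULL);
 *	iter_fd = bpf_iter_create(bpf_link__fd(link));
 *	while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
 *		write(1, buf, n);
 *	close(iter_fd);
 *	bpf_link__destroy(link);
 */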
3326 void __init tcp_v4_init(void)
3330 for_each_possible_cpu(cpu) {
3333 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3334 IPPROTO_TCP, &init_net);
3336 panic("Failed to create the TCP control socket.\n");
3337 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3339 /* Please enforce IP_DF and IPID==0 for RST and
3340 * ACK sent in SYN-RECV and TIME-WAIT state.
3342 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3344 per_cpu(ipv4_tcp_sk, cpu) = sk;
3346 if (register_pernet_subsys(&tcp_sk_ops))
3347 panic("Failed to create the TCP control socket.\n");
3349 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3350 bpf_iter_register();