1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * Implementation of the Transmission Control Protocol(TCP).
9 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
48 #define pr_fmt(fmt) "TCP: " fmt
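/* The pr_fmt() definition above makes every pr_*() message emitted from this
 * file appear in the kernel log with a "TCP: " prefix.
 */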
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
61 #include <net/net_namespace.h>
63 #include <net/inet_hashtables.h>
65 #include <net/transp_v6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
84 #include <trace/events/tcp.h>
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
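/* Per-CPU kernel control socket: tcp_v4_send_reset() and tcp_v4_send_ack()
 * below use it as the sending socket for ip_send_unicast_reply(), so RSTs
 * and out-of-context ACKs can be emitted without a full socket of their own.
 */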
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
98 return secure_tcp_seq(ip_hdr(skb)->daddr,
101 tcp_hdr(skb)->source);
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
106 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112 const struct inet_timewait_sock *tw = inet_twsk(sktw);
113 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 struct tcp_sock *tp = tcp_sk(sk);
117 /* Still does not detect *everything* that goes through
118 * lo, since we require a loopback src or dst address
119 * or direct binding to 'lo' interface.
121 bool loopback = false;
122 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
124 #if IS_ENABLED(CONFIG_IPV6)
125 if (tw->tw_family == AF_INET6) {
126 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
134 if (ipv4_is_loopback(tw->tw_daddr) ||
135 ipv4_is_loopback(tw->tw_rcv_saddr))
/* With PAWS, this is safe from the viewpoint of data integrity.
 * Even without PAWS it is safe, provided sequence spaces do not
 * overlap, i.e. at data rates <= 80 Mbit/sec.
 *
 * The idea is close to VJ's: the timestamp cache is held not per
 * host but per port pair, and the TW bucket is used as the state
 * holder.
 *
 * If the TW bucket has already been destroyed, we fall back to VJ's
 * scheme and use the initial timestamp retrieved from the peer table.
 */
153 if (tcptw->tw_ts_recent_stamp &&
154 (!twp || (reuse && time_after32(ktime_get_seconds(),
155 tcptw->tw_ts_recent_stamp)))) {
/* In case of repair and re-using TIME-WAIT sockets we still
 * want to be sure that it is safe as above but honor the
 * sequence numbers and time stamps set as part of the repair
 * process.
 *
 * Without this check, re-using a TIME-WAIT socket with TCP
 * repair would accumulate a -1 on the repair-assigned
 * sequence number. The first time it is reused the sequence
 * is -1, the second time -2, etc. This fixes that issue
 * without appearing to create any others.
 */
167 if (likely(!tp->repair)) {
168 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
172 WRITE_ONCE(tp->write_seq, seq);
173 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
174 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
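/* For illustration: the timestamp check above is what makes the
 * net.ipv4.tcp_tw_reuse sysctl safe for outgoing connections, e.g.
 *
 *	# sysctl -w net.ipv4.tcp_tw_reuse=1
 *
 * A local port in TIME-WAIT is only reused once at least one second has
 * passed since the last timestamp seen from the peer, so PAWS can still
 * reject stray segments from the old incarnation of the connection.
 */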
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
187 /* This check is replicated from tcp_v4_connect() and intended to
188 * prevent BPF program called below from accessing bytes that are out
189 * of the bound specified by user in addr_len.
191 if (addr_len < sizeof(struct sockaddr_in))
194 sock_owned_by_me(sk);
196 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
202 struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
203 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
204 struct inet_timewait_death_row *tcp_death_row;
205 __be32 daddr, nexthop, prev_sk_rcv_saddr;
206 struct inet_sock *inet = inet_sk(sk);
207 struct tcp_sock *tp = tcp_sk(sk);
208 struct ip_options_rcu *inet_opt;
209 struct net *net = sock_net(sk);
210 __be16 orig_sport, orig_dport;
215 if (addr_len < sizeof(struct sockaddr_in))
218 if (usin->sin_family != AF_INET)
219 return -EAFNOSUPPORT;
221 nexthop = daddr = usin->sin_addr.s_addr;
222 inet_opt = rcu_dereference_protected(inet->inet_opt,
223 lockdep_sock_is_held(sk));
224 if (inet_opt && inet_opt->opt.srr) {
227 nexthop = inet_opt->opt.faddr;
230 orig_sport = inet->inet_sport;
231 orig_dport = usin->sin_port;
232 fl4 = &inet->cork.fl.u.ip4;
233 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
234 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
238 if (err == -ENETUNREACH)
239 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
243 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
248 if (!inet_opt || !inet_opt->opt.srr)
251 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
253 if (!inet->inet_saddr) {
254 if (inet_csk(sk)->icsk_bind2_hash) {
255 prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo,
256 sk, net, inet->inet_num);
257 prev_sk_rcv_saddr = sk->sk_rcv_saddr;
259 inet->inet_saddr = fl4->saddr;
262 sk_rcv_saddr_set(sk, inet->inet_saddr);
264 if (prev_addr_hashbucket) {
265 err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
267 inet->inet_saddr = 0;
268 sk_rcv_saddr_set(sk, prev_sk_rcv_saddr);
274 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
275 /* Reset inherited state */
276 tp->rx_opt.ts_recent = 0;
277 tp->rx_opt.ts_recent_stamp = 0;
278 if (likely(!tp->repair))
279 WRITE_ONCE(tp->write_seq, 0);
282 inet->inet_dport = usin->sin_port;
283 sk_daddr_set(sk, daddr);
285 inet_csk(sk)->icsk_ext_hdr_len = 0;
287 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
289 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
/* Socket identity is still unknown (sport may be zero).
 * However we set the state to SYN-SENT and, without releasing the
 * socket lock, select a source port, enter ourselves into the hash
 * tables and complete initialization after this.
 */
296 tcp_set_state(sk, TCP_SYN_SENT);
297 err = inet_hash_connect(tcp_death_row, sk);
303 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
304 inet->inet_sport, inet->inet_dport, sk);
310 /* OK, now commit destination to socket. */
311 sk->sk_gso_type = SKB_GSO_TCPV4;
312 sk_setup_caps(sk, &rt->dst);
315 if (likely(!tp->repair)) {
317 WRITE_ONCE(tp->write_seq,
318 secure_tcp_seq(inet->inet_saddr,
322 tp->tsoffset = secure_tcp_ts_off(net, inet->inet_saddr,
326 inet->inet_id = get_random_u16();
328 if (tcp_fastopen_defer_connect(sk, &err))
333 err = tcp_connect(sk);
342 * This unhashes the socket and releases the local port,
345 tcp_set_state(sk, TCP_CLOSE);
347 sk->sk_route_caps = 0;
348 inet->inet_dport = 0;
351 EXPORT_SYMBOL(tcp_v4_connect);
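/* Usage sketch (userspace side, addresses are from the 192.0.2.0/24
 * documentation range): tcp_v4_connect() is reached via inet_stream_connect()
 * when an application issues an ordinary connect() on an AF_INET stream
 * socket:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */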
354 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
355 * It can be called through tcp_release_cb() if socket was owned by user
356 * at the time tcp_v4_err() was called to handle ICMP message.
358 void tcp_v4_mtu_reduced(struct sock *sk)
360 struct inet_sock *inet = inet_sk(sk);
361 struct dst_entry *dst;
364 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
366 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
367 dst = inet_csk_update_pmtu(sk, mtu);
/* Something is about to go wrong... Remember the soft error
 * in case this connection is not able to recover.
 */
374 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
375 sk->sk_err_soft = EMSGSIZE;
379 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
380 ip_sk_accept_pmtu(sk) &&
381 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
382 tcp_sync_mss(sk, mtu);
/* Resend the TCP packet because it's
 * clear that the old packet has been
 * dropped. This is the new "fast" path mtu
 * discovery.
 */
389 tcp_simple_retransmit(sk);
390 } /* else let the usual retransmit timer handle it */
392 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
394 static void do_redirect(struct sk_buff *skb, struct sock *sk)
396 struct dst_entry *dst = __sk_dst_check(sk, 0);
399 dst->ops->redirect(dst, sk, skb);
403 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
404 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
406 struct request_sock *req = inet_reqsk(sk);
407 struct net *net = sock_net(sk);
409 /* ICMPs are not backlogged, hence we cannot get
410 * an established socket here.
412 if (seq != tcp_rsk(req)->snt_isn) {
413 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
416 * Still in SYN_RECV, just remove it silently.
417 * There is no good way to pass the error to the newly
418 * created socket, and POSIX does not want network
419 * errors returned from accept().
421 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
422 tcp_listendrop(req->rsk_listener);
426 EXPORT_SYMBOL(tcp_req_err);
428 /* TCP-LD (RFC 6069) logic */
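/* When an ICMP net/host unreachable arrives for a segment we have been
 * retransmitting, RFC 6069 ("TCP-LD") lets us undo one step of RTO backoff:
 * the ICMP tells us the retransmission was lost to a connectivity disruption
 * rather than to congestion, so the segment may be retransmitted again
 * sooner once the path recovers.
 */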
429 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
431 struct inet_connection_sock *icsk = inet_csk(sk);
432 struct tcp_sock *tp = tcp_sk(sk);
437 if (sock_owned_by_user(sk))
440 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
444 skb = tcp_rtx_queue_head(sk);
445 if (WARN_ON_ONCE(!skb))
448 icsk->icsk_backoff--;
449 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
450 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
452 tcp_mstamp_refresh(tp);
453 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
454 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
457 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
458 remaining, TCP_RTO_MAX);
460 /* RTO revert clocked out retransmission.
461 * Will retransmit now.
463 tcp_retransmit_timer(sk);
466 EXPORT_SYMBOL(tcp_ld_RTO_revert);
469 * This routine is called by the ICMP module when it gets some
470 * sort of error condition. If err < 0 then the socket should
471 * be closed and the error returned to the user. If err > 0
472 * it's just the icmp type << 8 | icmp code. After adjustment
473 * header points to the first 8 bytes of the tcp header. We need
474 * to find the appropriate port.
476 * The locking strategy used here is very "optimistic". When
477 * someone else accesses the socket the ICMP is just dropped
478 * and for some paths there is no check at all.
479 * A more general error queue to queue errors for later handling
480 * is probably better.
484 int tcp_v4_err(struct sk_buff *skb, u32 info)
486 const struct iphdr *iph = (const struct iphdr *)skb->data;
487 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
489 struct inet_sock *inet;
490 const int type = icmp_hdr(skb)->type;
491 const int code = icmp_hdr(skb)->code;
493 struct request_sock *fastopen;
496 struct net *net = dev_net(skb->dev);
498 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
499 iph->daddr, th->dest, iph->saddr,
500 ntohs(th->source), inet_iif(skb), 0);
502 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
505 if (sk->sk_state == TCP_TIME_WAIT) {
506 inet_twsk_put(inet_twsk(sk));
509 seq = ntohl(th->seq);
510 if (sk->sk_state == TCP_NEW_SYN_RECV) {
511 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
512 type == ICMP_TIME_EXCEEDED ||
513 (type == ICMP_DEST_UNREACH &&
514 (code == ICMP_NET_UNREACH ||
515 code == ICMP_HOST_UNREACH)));
/* If too many ICMPs get dropped on busy
 * servers this needs to be solved differently.
 * We do take care of the PMTU discovery (RFC 1191) special case:
 * we can receive locally generated ICMP messages while the socket is held.
 */
525 if (sock_owned_by_user(sk)) {
526 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
527 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
529 if (sk->sk_state == TCP_CLOSE)
532 if (static_branch_unlikely(&ip4_min_ttl)) {
533 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
534 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
535 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
541 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
542 fastopen = rcu_dereference(tp->fastopen_rsk);
543 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
544 if (sk->sk_state != TCP_LISTEN &&
545 !between(seq, snd_una, tp->snd_nxt)) {
546 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
552 if (!sock_owned_by_user(sk))
553 do_redirect(skb, sk);
555 case ICMP_SOURCE_QUENCH:
556 /* Just silently ignore these. */
558 case ICMP_PARAMETERPROB:
561 case ICMP_DEST_UNREACH:
562 if (code > NR_ICMP_UNREACH)
565 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
/* We are not interested in TCP_LISTEN and open_requests
 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
 * they should go through unfragmented).
 */
570 if (sk->sk_state == TCP_LISTEN)
573 WRITE_ONCE(tp->mtu_info, info);
574 if (!sock_owned_by_user(sk)) {
575 tcp_v4_mtu_reduced(sk);
577 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
583 err = icmp_err_convert[code].errno;
/* Check whether this ICMP message allows reverting the RTO
 * backoff (see RFC 6069).
 */
if (!fastopen &&
    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
589 tcp_ld_RTO_revert(sk, seq);
591 case ICMP_TIME_EXCEEDED:
598 switch (sk->sk_state) {
601 /* Only in fast or simultaneous open. If a fast open socket is
602 * already accepted it is treated as a connected one below.
604 if (fastopen && !fastopen->sk)
607 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
609 if (!sock_owned_by_user(sk)) {
616 sk->sk_err_soft = err;
/* If we've already connected we will keep trying
 * until we time out, or the user gives up.
 *
 * RFC 1122 4.2.3.9 allows us to consider as hard errors
 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 * but it is obsoleted by PMTU discovery).
 *
 * Note that in the modern internet, where routing is unreliable
 * and broken firewalls sit in every dark corner sending random
 * errors ordered by their masters, even these two messages have
 * finally lost their original sense (even Linux sends invalid
 * PORT_UNREACHs).
 *
 * Now we are in compliance with the RFCs.
 */
638 if (!sock_owned_by_user(sk) && inet->recverr) {
641 } else { /* Only an error on timeout */
642 sk->sk_err_soft = err;
651 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
653 struct tcphdr *th = tcp_hdr(skb);
655 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
656 skb->csum_start = skb_transport_header(skb) - skb->head;
657 skb->csum_offset = offsetof(struct tcphdr, check);
660 /* This routine computes an IPv4 TCP checksum. */
661 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
663 const struct inet_sock *inet = inet_sk(sk);
665 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
667 EXPORT_SYMBOL(tcp_v4_send_check);
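/* Note: __tcp_v4_send_check() only fills in the pseudo-header sum;
 * skb->csum_start and skb->csum_offset tell the NIC (or skb_checksum_help())
 * where to fold in the checksum over the TCP header and payload, following
 * the usual CHECKSUM_PARTIAL convention.
 */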
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
 *	when sending an RST?
 *
 *	Answer: if a packet caused the RST, it is not for a socket
 *	existing in our system; if it is matched to a socket, it is
 *	just a duplicate segment or a bug in the other side's TCP.
 *	So we build the reply based only on the parameters that
 *	arrived with the segment.
 *
 *	Exception: precedence violation. We do not implement it in
 *	any case.
 */
682 #ifdef CONFIG_TCP_MD5SIG
683 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
685 #define OPTION_BYTES sizeof(__be32)
688 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
690 const struct tcphdr *th = tcp_hdr(skb);
693 __be32 opt[OPTION_BYTES / sizeof(__be32)];
695 struct ip_reply_arg arg;
696 #ifdef CONFIG_TCP_MD5SIG
697 struct tcp_md5sig_key *key = NULL;
698 const __u8 *hash_location = NULL;
699 unsigned char newhash[16];
701 struct sock *sk1 = NULL;
703 u64 transmit_time = 0;
707 /* Never send a reset in response to a reset. */
/* If sk is not NULL, it means we did a successful lookup and the
 * incoming route had to be correct. prequeue might have dropped our dst.
 */
714 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
717 /* Swap the send and the receive. */
718 memset(&rep, 0, sizeof(rep));
719 rep.th.dest = th->source;
720 rep.th.source = th->dest;
721 rep.th.doff = sizeof(struct tcphdr) / 4;
725 rep.th.seq = th->ack_seq;
728 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
729 skb->len - (th->doff << 2));
732 memset(&arg, 0, sizeof(arg));
733 arg.iov[0].iov_base = (unsigned char *)&rep;
734 arg.iov[0].iov_len = sizeof(rep.th);
736 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
737 #ifdef CONFIG_TCP_MD5SIG
739 hash_location = tcp_parse_md5sig_option(th);
740 if (sk && sk_fullsock(sk)) {
741 const union tcp_md5_addr *addr;
744 /* sdif set, means packet ingressed via a device
745 * in an L3 domain and inet_iif is set to it.
747 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
748 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
749 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
750 } else if (hash_location) {
751 const union tcp_md5_addr *addr;
752 int sdif = tcp_v4_sdif(skb);
753 int dif = inet_iif(skb);
/* The active side is lost. Try to find the listening socket through
 * the source port, and then find the md5 key through the listening
 * socket. We do not lose any security here: the incoming packet is
 * checked against the md5 hash of the key we find, and no RST is
 * generated if the hash doesn't match.
 */
763 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
764 NULL, 0, ip_hdr(skb)->saddr,
765 th->source, ip_hdr(skb)->daddr,
766 ntohs(th->source), dif, sdif);
/* don't send an RST if we can't find a key */
771 /* sdif set, means packet ingressed via a device
772 * in an L3 domain and dif is set to it.
774 l3index = sdif ? dif : 0;
775 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
776 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
781 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
782 if (genhash || memcmp(hash_location, newhash, 16) != 0)
788 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
790 (TCPOPT_MD5SIG << 8) |
792 /* Update length and the length the header thinks exists */
793 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
794 rep.th.doff = arg.iov[0].iov_len / 4;
796 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
797 key, ip_hdr(skb)->saddr,
798 ip_hdr(skb)->daddr, &rep.th);
801 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
802 if (rep.opt[0] == 0) {
803 __be32 mrst = mptcp_reset_option(skb);
807 arg.iov[0].iov_len += sizeof(mrst);
808 rep.th.doff = arg.iov[0].iov_len / 4;
812 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
813 ip_hdr(skb)->saddr, /* XXX */
814 arg.iov[0].iov_len, IPPROTO_TCP, 0);
815 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
816 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
/* When the socket is gone, all binding information is lost and
 * routing might fail. No choice here: if we choose to force the
 * input interface, we will misroute in the case of an asymmetric route.
 */
823 arg.bound_dev_if = sk->sk_bound_dev_if;
825 trace_tcp_send_reset(sk, skb);
828 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
829 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
831 arg.tos = ip_hdr(skb)->tos;
832 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
834 ctl_sk = this_cpu_read(ipv4_tcp_sk);
835 sock_net_set(ctl_sk, net);
837 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
838 inet_twsk(sk)->tw_mark : sk->sk_mark;
839 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
840 inet_twsk(sk)->tw_priority : sk->sk_priority;
841 transmit_time = tcp_transmit_time(sk);
842 xfrm_sk_clone_policy(ctl_sk, sk);
844 ip_send_unicast_reply(ctl_sk,
845 skb, &TCP_SKB_CB(skb)->header.h4.opt,
846 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
847 &arg, arg.iov[0].iov_len,
851 xfrm_sk_free_policy(ctl_sk);
852 sock_net_set(ctl_sk, &init_net);
853 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
854 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
857 #ifdef CONFIG_TCP_MD5SIG
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 * outside of socket context, is certainly ugly. What can we do?
 */
867 static void tcp_v4_send_ack(const struct sock *sk,
868 struct sk_buff *skb, u32 seq, u32 ack,
869 u32 win, u32 tsval, u32 tsecr, int oif,
870 struct tcp_md5sig_key *key,
871 int reply_flags, u8 tos)
873 const struct tcphdr *th = tcp_hdr(skb);
876 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
877 #ifdef CONFIG_TCP_MD5SIG
878 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
882 struct net *net = sock_net(sk);
883 struct ip_reply_arg arg;
887 memset(&rep.th, 0, sizeof(struct tcphdr));
888 memset(&arg, 0, sizeof(arg));
890 arg.iov[0].iov_base = (unsigned char *)&rep;
891 arg.iov[0].iov_len = sizeof(rep.th);
893 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
894 (TCPOPT_TIMESTAMP << 8) |
896 rep.opt[1] = htonl(tsval);
897 rep.opt[2] = htonl(tsecr);
898 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
901 /* Swap the send and the receive. */
902 rep.th.dest = th->source;
903 rep.th.source = th->dest;
904 rep.th.doff = arg.iov[0].iov_len / 4;
905 rep.th.seq = htonl(seq);
906 rep.th.ack_seq = htonl(ack);
908 rep.th.window = htons(win);
910 #ifdef CONFIG_TCP_MD5SIG
912 int offset = (tsecr) ? 3 : 0;
914 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
916 (TCPOPT_MD5SIG << 8) |
918 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
919 rep.th.doff = arg.iov[0].iov_len/4;
921 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
922 key, ip_hdr(skb)->saddr,
923 ip_hdr(skb)->daddr, &rep.th);
926 arg.flags = reply_flags;
927 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
928 ip_hdr(skb)->saddr, /* XXX */
929 arg.iov[0].iov_len, IPPROTO_TCP, 0);
930 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
932 arg.bound_dev_if = oif;
934 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
936 ctl_sk = this_cpu_read(ipv4_tcp_sk);
937 sock_net_set(ctl_sk, net);
938 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
939 inet_twsk(sk)->tw_mark : sk->sk_mark;
940 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
941 inet_twsk(sk)->tw_priority : sk->sk_priority;
942 transmit_time = tcp_transmit_time(sk);
943 ip_send_unicast_reply(ctl_sk,
944 skb, &TCP_SKB_CB(skb)->header.h4.opt,
945 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
946 &arg, arg.iov[0].iov_len,
950 sock_net_set(ctl_sk, &init_net);
951 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
955 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
957 struct inet_timewait_sock *tw = inet_twsk(sk);
958 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
960 tcp_v4_send_ack(sk, skb,
961 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
962 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
963 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
966 tcp_twsk_md5_key(tcptw),
967 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
974 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
975 struct request_sock *req)
977 const union tcp_md5_addr *addr;
980 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
981 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
983 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
987 * The window field (SEG.WND) of every outgoing segment, with the
988 * exception of <SYN> segments, MUST be right-shifted by
989 * Rcv.Wind.Shift bits:
991 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
992 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
993 tcp_v4_send_ack(sk, skb, seq,
994 tcp_rsk(req)->rcv_nxt,
995 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
996 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
999 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
1000 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1005 * Send a SYN-ACK after having received a SYN.
1006 * This still operates on a request_sock only, not on a big
1009 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1011 struct request_sock *req,
1012 struct tcp_fastopen_cookie *foc,
1013 enum tcp_synack_type synack_type,
1014 struct sk_buff *syn_skb)
1016 const struct inet_request_sock *ireq = inet_rsk(req);
1019 struct sk_buff *skb;
1022 /* First, grab a route. */
1023 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1026 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1029 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1031 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1032 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1033 (inet_sk(sk)->tos & INET_ECN_MASK) :
1036 if (!INET_ECN_is_capable(tos) &&
1037 tcp_bpf_ca_needs_ecn((struct sock *)req))
1038 tos |= INET_ECN_ECT_0;
1041 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1043 rcu_dereference(ireq->ireq_opt),
1046 err = net_xmit_eval(err);
1053 * IPv4 request_sock destructor.
1055 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1057 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1060 #ifdef CONFIG_TCP_MD5SIG
/*
 * RFC 2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */
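/* Usage sketch (userspace side; "peer" is a struct sockaddr_in assumed to be
 * filled in elsewhere): both endpoints agree on a key out of band and install
 * it with the TCP_MD5SIG socket option before connect()/listen():
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *
 *	memcpy(&md5.tcpm_addr, &peer, sizeof(peer));
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * tcp_v4_parse_md5_keys() below is the kernel-side handler for this option
 * on IPv4 sockets.
 */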
1067 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1068 EXPORT_SYMBOL(tcp_md5_needed);
1070 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1075 /* l3index always overrides non-l3index */
1076 if (old->l3index && new->l3index == 0)
1078 if (old->l3index == 0 && new->l3index)
1081 return old->prefixlen < new->prefixlen;
1084 /* Find the Key structure for an address. */
1085 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1086 const union tcp_md5_addr *addr,
1089 const struct tcp_sock *tp = tcp_sk(sk);
1090 struct tcp_md5sig_key *key;
1091 const struct tcp_md5sig_info *md5sig;
1093 struct tcp_md5sig_key *best_match = NULL;
1096 /* caller either holds rcu_read_lock() or socket lock */
1097 md5sig = rcu_dereference_check(tp->md5sig_info,
1098 lockdep_sock_is_held(sk));
1102 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1103 lockdep_sock_is_held(sk)) {
1104 if (key->family != family)
1106 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1108 if (family == AF_INET) {
1109 mask = inet_make_mask(key->prefixlen);
1110 match = (key->addr.a4.s_addr & mask) ==
1111 (addr->a4.s_addr & mask);
1112 #if IS_ENABLED(CONFIG_IPV6)
1113 } else if (family == AF_INET6) {
1114 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1121 if (match && better_md5_match(best_match, key))
1126 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1128 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1129 const union tcp_md5_addr *addr,
1130 int family, u8 prefixlen,
1131 int l3index, u8 flags)
1133 const struct tcp_sock *tp = tcp_sk(sk);
1134 struct tcp_md5sig_key *key;
1135 unsigned int size = sizeof(struct in_addr);
1136 const struct tcp_md5sig_info *md5sig;
1138 /* caller either holds rcu_read_lock() or socket lock */
1139 md5sig = rcu_dereference_check(tp->md5sig_info,
1140 lockdep_sock_is_held(sk));
1143 #if IS_ENABLED(CONFIG_IPV6)
1144 if (family == AF_INET6)
1145 size = sizeof(struct in6_addr);
1147 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1148 lockdep_sock_is_held(sk)) {
1149 if (key->family != family)
1151 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1153 if (key->l3index != l3index)
1155 if (!memcmp(&key->addr, addr, size) &&
1156 key->prefixlen == prefixlen)
1162 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1163 const struct sock *addr_sk)
1165 const union tcp_md5_addr *addr;
1168 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1169 addr_sk->sk_bound_dev_if);
1170 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1171 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1173 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1175 /* This can be called on a newly created socket, from other files */
1176 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1177 int family, u8 prefixlen, int l3index, u8 flags,
1178 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1180 /* Add Key to the list */
1181 struct tcp_md5sig_key *key;
1182 struct tcp_sock *tp = tcp_sk(sk);
1183 struct tcp_md5sig_info *md5sig;
1185 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
/* Pre-existing entry - just update that one.
 * Note that the key might be used concurrently.
 * data_race() tells KCSAN that we do not care about
 * key mismatches, since changing the MD5 key on live flows
 * can lead to packet drops.
 */
1193 data_race(memcpy(key->key, newkey, newkeylen));
/* Pairs with READ_ONCE() in tcp_md5_hash_key().
 * Also note that a reader could observe the new key->keylen value
 * but the old key->key[]; this is the reason we use __GFP_ZERO
 * at sock_kmalloc() time below these lines.
 */
1200 WRITE_ONCE(key->keylen, newkeylen);
1205 md5sig = rcu_dereference_protected(tp->md5sig_info,
1206 lockdep_sock_is_held(sk));
1208 md5sig = kmalloc(sizeof(*md5sig), gfp);
1213 INIT_HLIST_HEAD(&md5sig->head);
1214 rcu_assign_pointer(tp->md5sig_info, md5sig);
1217 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1220 if (!tcp_alloc_md5sig_pool()) {
1221 sock_kfree_s(sk, key, sizeof(*key));
1225 memcpy(key->key, newkey, newkeylen);
1226 key->keylen = newkeylen;
1227 key->family = family;
1228 key->prefixlen = prefixlen;
1229 key->l3index = l3index;
1231 memcpy(&key->addr, addr,
1232 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1233 sizeof(struct in_addr));
1234 hlist_add_head_rcu(&key->node, &md5sig->head);
1237 EXPORT_SYMBOL(tcp_md5_do_add);
1239 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1240 u8 prefixlen, int l3index, u8 flags)
1242 struct tcp_md5sig_key *key;
1244 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1247 hlist_del_rcu(&key->node);
1248 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1249 kfree_rcu(key, rcu);
1252 EXPORT_SYMBOL(tcp_md5_do_del);
1254 static void tcp_clear_md5_list(struct sock *sk)
1256 struct tcp_sock *tp = tcp_sk(sk);
1257 struct tcp_md5sig_key *key;
1258 struct hlist_node *n;
1259 struct tcp_md5sig_info *md5sig;
1261 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1263 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1264 hlist_del_rcu(&key->node);
1265 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1266 kfree_rcu(key, rcu);
1270 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1271 sockptr_t optval, int optlen)
1273 struct tcp_md5sig cmd;
1274 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1275 const union tcp_md5_addr *addr;
1280 if (optlen < sizeof(cmd))
1283 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1286 if (sin->sin_family != AF_INET)
1289 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1291 if (optname == TCP_MD5SIG_EXT &&
1292 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1293 prefixlen = cmd.tcpm_prefixlen;
1298 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1299 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1300 struct net_device *dev;
1303 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1304 if (dev && netif_is_l3_master(dev))
1305 l3index = dev->ifindex;
/* ok to reference dev/l3index as set or not set outside of rcu;
 * right now the device MUST be an L3 master
 */
1312 if (!dev || !l3index)
1316 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1318 if (!cmd.tcpm_keylen)
1319 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1321 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1324 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1325 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1328 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1329 __be32 daddr, __be32 saddr,
1330 const struct tcphdr *th, int nbytes)
1332 struct tcp4_pseudohdr *bp;
1333 struct scatterlist sg;
1340 bp->protocol = IPPROTO_TCP;
1341 bp->len = cpu_to_be16(nbytes);
1343 _th = (struct tcphdr *)(bp + 1);
1344 memcpy(_th, th, sizeof(*th));
1347 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1348 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1349 sizeof(*bp) + sizeof(*th));
1350 return crypto_ahash_update(hp->md5_req);
1353 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1354 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1356 struct tcp_md5sig_pool *hp;
1357 struct ahash_request *req;
1359 hp = tcp_get_md5sig_pool();
1361 goto clear_hash_noput;
1364 if (crypto_ahash_init(req))
1366 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1368 if (tcp_md5_hash_key(hp, key))
1370 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1371 if (crypto_ahash_final(req))
1374 tcp_put_md5sig_pool();
1378 tcp_put_md5sig_pool();
1380 memset(md5_hash, 0, 16);
1384 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1385 const struct sock *sk,
1386 const struct sk_buff *skb)
1388 struct tcp_md5sig_pool *hp;
1389 struct ahash_request *req;
1390 const struct tcphdr *th = tcp_hdr(skb);
1391 __be32 saddr, daddr;
1393 if (sk) { /* valid for establish/request sockets */
1394 saddr = sk->sk_rcv_saddr;
1395 daddr = sk->sk_daddr;
1397 const struct iphdr *iph = ip_hdr(skb);
1402 hp = tcp_get_md5sig_pool();
1404 goto clear_hash_noput;
1407 if (crypto_ahash_init(req))
1410 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1412 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1414 if (tcp_md5_hash_key(hp, key))
1416 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1417 if (crypto_ahash_final(req))
1420 tcp_put_md5sig_pool();
1424 tcp_put_md5sig_pool();
1426 memset(md5_hash, 0, 16);
1429 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1433 static void tcp_v4_init_req(struct request_sock *req,
1434 const struct sock *sk_listener,
1435 struct sk_buff *skb)
1437 struct inet_request_sock *ireq = inet_rsk(req);
1438 struct net *net = sock_net(sk_listener);
1440 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1441 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1442 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1445 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1446 struct sk_buff *skb,
1448 struct request_sock *req)
1450 tcp_v4_init_req(req, sk, skb);
1452 if (security_inet_conn_request(sk, skb, req))
1455 return inet_csk_route_req(sk, &fl->u.ip4, req);
1458 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1460 .obj_size = sizeof(struct tcp_request_sock),
1461 .rtx_syn_ack = tcp_rtx_synack,
1462 .send_ack = tcp_v4_reqsk_send_ack,
1463 .destructor = tcp_v4_reqsk_destructor,
1464 .send_reset = tcp_v4_send_reset,
1465 .syn_ack_timeout = tcp_syn_ack_timeout,
1468 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1469 .mss_clamp = TCP_MSS_DEFAULT,
1470 #ifdef CONFIG_TCP_MD5SIG
1471 .req_md5_lookup = tcp_v4_md5_lookup,
1472 .calc_md5_hash = tcp_v4_md5_hash_skb,
1474 #ifdef CONFIG_SYN_COOKIES
1475 .cookie_init_seq = cookie_v4_init_sequence,
1477 .route_req = tcp_v4_route_req,
1478 .init_seq = tcp_v4_init_seq,
1479 .init_ts_off = tcp_v4_init_ts_off,
1480 .send_synack = tcp_v4_send_synack,
1483 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1485 /* Never answer to SYNs send to broadcast or multicast */
1486 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1489 return tcp_conn_request(&tcp_request_sock_ops,
1490 &tcp_request_sock_ipv4_ops, sk, skb);
1496 EXPORT_SYMBOL(tcp_v4_conn_request);
1500 * The three way handshake has completed - we got a valid synack -
1501 * now create the new socket.
1503 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1504 struct request_sock *req,
1505 struct dst_entry *dst,
1506 struct request_sock *req_unhash,
1509 struct inet_request_sock *ireq;
1510 bool found_dup_sk = false;
1511 struct inet_sock *newinet;
1512 struct tcp_sock *newtp;
1514 #ifdef CONFIG_TCP_MD5SIG
1515 const union tcp_md5_addr *addr;
1516 struct tcp_md5sig_key *key;
1519 struct ip_options_rcu *inet_opt;
1521 if (sk_acceptq_is_full(sk))
1524 newsk = tcp_create_openreq_child(sk, req, skb);
1528 newsk->sk_gso_type = SKB_GSO_TCPV4;
1529 inet_sk_rx_dst_set(newsk, skb);
1531 newtp = tcp_sk(newsk);
1532 newinet = inet_sk(newsk);
1533 ireq = inet_rsk(req);
1534 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1535 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1536 newsk->sk_bound_dev_if = ireq->ir_iif;
1537 newinet->inet_saddr = ireq->ir_loc_addr;
1538 inet_opt = rcu_dereference(ireq->ireq_opt);
1539 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1540 newinet->mc_index = inet_iif(skb);
1541 newinet->mc_ttl = ip_hdr(skb)->ttl;
1542 newinet->rcv_tos = ip_hdr(skb)->tos;
1543 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1545 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1546 newinet->inet_id = get_random_u16();
1548 /* Set ToS of the new socket based upon the value of incoming SYN.
1549 * ECT bits are set later in tcp_init_transfer().
1551 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1552 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1555 dst = inet_csk_route_child_sock(sk, newsk, req);
1559 /* syncookie case : see end of cookie_v4_check() */
1561 sk_setup_caps(newsk, dst);
1563 tcp_ca_openreq_child(newsk, dst);
1565 tcp_sync_mss(newsk, dst_mtu(dst));
1566 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1568 tcp_initialize_rcv_mss(newsk);
1570 #ifdef CONFIG_TCP_MD5SIG
1571 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1572 /* Copy over the MD5 key from the original socket */
1573 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1574 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
/* We're using one, so create a matching key
 * on the newsk structure. If we fail to get
 * memory, then we end up not copying the key
 * across.
 */
1582 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1583 key->key, key->keylen, GFP_ATOMIC);
1584 sk_gso_disable(newsk);
1588 if (__inet_inherit_port(sk, newsk) < 0)
1590 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1592 if (likely(*own_req)) {
1593 tcp_move_syn(newtp, req);
1594 ireq->ireq_opt = NULL;
1596 newinet->inet_opt = NULL;
1598 if (!req_unhash && found_dup_sk) {
/* This code path should only be executed in the
 * syncookie case.
 */
1602 bh_unlock_sock(newsk);
1610 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1617 newinet->inet_opt = NULL;
1618 inet_csk_prepare_forced_close(newsk);
1622 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1624 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1626 #ifdef CONFIG_SYN_COOKIES
1627 const struct tcphdr *th = tcp_hdr(skb);
1630 sk = cookie_v4_check(sk, skb);
1635 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1636 struct tcphdr *th, u32 *cookie)
1639 #ifdef CONFIG_SYN_COOKIES
1640 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1641 &tcp_request_sock_ipv4_ops, sk, th);
1643 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1644 tcp_synq_overflow(sk);
1650 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
1660 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1662 enum skb_drop_reason reason;
1665 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1666 struct dst_entry *dst;
1668 dst = rcu_dereference_protected(sk->sk_rx_dst,
1669 lockdep_sock_is_held(sk));
1671 sock_rps_save_rxhash(sk, skb);
1672 sk_mark_napi_id(sk, skb);
1674 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1675 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1677 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1681 tcp_rcv_established(sk, skb);
1685 reason = SKB_DROP_REASON_NOT_SPECIFIED;
1686 if (tcp_checksum_complete(skb))
1689 if (sk->sk_state == TCP_LISTEN) {
1690 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1695 if (tcp_child_process(sk, nsk, skb)) {
1702 sock_rps_save_rxhash(sk, skb);
1704 if (tcp_rcv_state_process(sk, skb)) {
1711 tcp_v4_send_reset(rsk, skb);
1713 kfree_skb_reason(skb, reason);
1714 /* Be careful here. If this function gets more complicated and
1715 * gcc suffers from register pressure on the x86, sk (in %ebx)
1716 * might be destroyed here. This current version compiles correctly,
1717 * but you have been warned.
1722 reason = SKB_DROP_REASON_TCP_CSUM;
1723 trace_tcp_bad_csum(skb);
1724 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1725 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1728 EXPORT_SYMBOL(tcp_v4_do_rcv);
1730 int tcp_v4_early_demux(struct sk_buff *skb)
1732 struct net *net = dev_net(skb->dev);
1733 const struct iphdr *iph;
1734 const struct tcphdr *th;
1737 if (skb->pkt_type != PACKET_HOST)
1740 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1746 if (th->doff < sizeof(struct tcphdr) / 4)
1749 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1750 iph->saddr, th->source,
1751 iph->daddr, ntohs(th->dest),
1752 skb->skb_iif, inet_sdif(skb));
1755 skb->destructor = sock_edemux;
1756 if (sk_fullsock(sk)) {
1757 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1760 dst = dst_check(dst, 0);
1762 sk->sk_rx_dst_ifindex == skb->skb_iif)
1763 skb_dst_set_noref(skb, dst);
1769 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1770 enum skb_drop_reason *reason)
1772 u32 limit, tail_gso_size, tail_gso_segs;
1773 struct skb_shared_info *shinfo;
1774 const struct tcphdr *th;
1775 struct tcphdr *thtail;
1776 struct sk_buff *tail;
1777 unsigned int hdrlen;
/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
 * we can fix skb->truesize to its real value to avoid future drops.
 * This is valid because the skb is not yet charged to the socket.
 * It has been noticed that pure SACK packets were sometimes dropped
 * (if cooked by drivers without the copybreak feature).
 */
1793 if (unlikely(tcp_checksum_complete(skb))) {
1795 trace_tcp_bad_csum(skb);
1796 *reason = SKB_DROP_REASON_TCP_CSUM;
1797 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1798 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
/* Attempt coalescing to the last skb in the backlog, even if we are
 * above the limits.
 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
 */
1806 th = (const struct tcphdr *)skb->data;
1807 hdrlen = th->doff * 4;
1809 tail = sk->sk_backlog.tail;
1812 thtail = (struct tcphdr *)tail->data;
1814 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1815 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1816 ((TCP_SKB_CB(tail)->tcp_flags |
1817 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1818 !((TCP_SKB_CB(tail)->tcp_flags &
1819 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1820 ((TCP_SKB_CB(tail)->tcp_flags ^
1821 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1822 #ifdef CONFIG_TLS_DEVICE
1823 tail->decrypted != skb->decrypted ||
1825 thtail->doff != th->doff ||
1826 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1829 __skb_pull(skb, hdrlen);
1831 shinfo = skb_shinfo(skb);
1832 gso_size = shinfo->gso_size ?: skb->len;
1833 gso_segs = shinfo->gso_segs ?: 1;
1835 shinfo = skb_shinfo(tail);
1836 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1837 tail_gso_segs = shinfo->gso_segs ?: 1;
1839 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1840 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1842 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1843 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1844 thtail->window = th->window;
1847 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1848 * thtail->fin, so that the fast path in tcp_rcv_established()
1849 * is not entered if we append a packet with a FIN.
1850 * SYN, RST, URG are not present.
1851 * ACK is set on both packets.
1852 * PSH : we do not really care in TCP stack,
1853 * at least for 'GRO' packets.
1855 thtail->fin |= th->fin;
1856 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1858 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1859 TCP_SKB_CB(tail)->has_rxtstamp = true;
1860 tail->tstamp = skb->tstamp;
1861 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1864 /* Not as strict as GRO. We only need to carry mss max value */
1865 shinfo->gso_size = max(gso_size, tail_gso_size);
1866 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1868 sk->sk_backlog.len += delta;
1869 __NET_INC_STATS(sock_net(sk),
1870 LINUX_MIB_TCPBACKLOGCOALESCE);
1871 kfree_skb_partial(skb, fragstolen);
1874 __skb_push(skb, hdrlen);
/* Only the socket owner can try to collapse/prune rx queues
 * to reduce memory overhead, so add a little headroom here.
 * Only a few socket backlogs are likely to be non-empty concurrently.
 */
1881 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
1883 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1885 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1886 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1891 EXPORT_SYMBOL(tcp_add_backlog);
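/* The coalescing above merges in-order segments that arrive while the owner
 * holds the socket lock, so the backlog ends up holding one large GRO-style
 * skb instead of many small ones (counted in LINUX_MIB_TCPBACKLOGCOALESCE).
 */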
1893 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1895 struct tcphdr *th = (struct tcphdr *)skb->data;
1897 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1899 EXPORT_SYMBOL(tcp_filter);
1901 static void tcp_v4_restore_cb(struct sk_buff *skb)
1903 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1904 sizeof(struct inet_skb_parm));
1907 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1908 const struct tcphdr *th)
/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
 * barrier() makes sure the compiler won't play aliasing games.
 */
1913 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1914 sizeof(struct inet_skb_parm));
1917 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1918 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1919 skb->len - th->doff * 4);
1920 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1921 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1922 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1923 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1924 TCP_SKB_CB(skb)->sacked = 0;
1925 TCP_SKB_CB(skb)->has_rxtstamp =
1926 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1933 int tcp_v4_rcv(struct sk_buff *skb)
1935 struct net *net = dev_net(skb->dev);
1936 enum skb_drop_reason drop_reason;
1937 int sdif = inet_sdif(skb);
1938 int dif = inet_iif(skb);
1939 const struct iphdr *iph;
1940 const struct tcphdr *th;
1945 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1946 if (skb->pkt_type != PACKET_HOST)
1949 /* Count it even if it's bad */
1950 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1952 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1955 th = (const struct tcphdr *)skb->data;
1957 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1958 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1961 if (!pskb_may_pull(skb, th->doff * 4))
/* An explanation is required here, I think.
 * Packet length and doff are validated by header prediction,
 * provided the case of th->doff == 0 is eliminated.
 * So, we defer the checks. */
1969 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1972 th = (const struct tcphdr *)skb->data;
1975 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
1976 skb, __tcp_hdrlen(th), th->source,
1977 th->dest, sdif, &refcounted);
1982 if (sk->sk_state == TCP_TIME_WAIT)
1985 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1986 struct request_sock *req = inet_reqsk(sk);
1987 bool req_stolen = false;
1990 sk = req->rsk_listener;
1991 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1992 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1994 drop_reason = tcp_inbound_md5_hash(sk, skb,
1995 &iph->saddr, &iph->daddr,
1996 AF_INET, dif, sdif);
1997 if (unlikely(drop_reason)) {
1998 sk_drops_add(sk, skb);
2002 if (tcp_checksum_complete(skb)) {
2006 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2007 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2009 inet_csk_reqsk_queue_drop_and_put(sk, req);
2013 /* reuseport_migrate_sock() has already held one sk_refcnt
2017 /* We own a reference on the listener, increase it again
2018 * as we might lose it too soon.
2024 if (!tcp_filter(sk, skb)) {
2025 th = (const struct tcphdr *)skb->data;
2027 tcp_v4_fill_cb(skb, iph, th);
2028 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2030 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2035 /* Another cpu got exclusive access to req
2036 * and created a full blown socket.
2037 * Try to feed this packet to this socket
2038 * instead of discarding it.
2040 tcp_v4_restore_cb(skb);
2044 goto discard_and_relse;
2049 tcp_v4_restore_cb(skb);
2050 } else if (tcp_child_process(sk, nsk, skb)) {
2051 tcp_v4_send_reset(nsk, skb);
2052 goto discard_and_relse;
2059 if (static_branch_unlikely(&ip4_min_ttl)) {
2060 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2061 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2062 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2063 goto discard_and_relse;
2067 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2068 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2069 goto discard_and_relse;
2072 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2073 &iph->daddr, AF_INET, dif, sdif);
2075 goto discard_and_relse;
2079 if (tcp_filter(sk, skb)) {
2080 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2081 goto discard_and_relse;
2083 th = (const struct tcphdr *)skb->data;
2085 tcp_v4_fill_cb(skb, iph, th);
2089 if (sk->sk_state == TCP_LISTEN) {
2090 ret = tcp_v4_do_rcv(sk, skb);
2091 goto put_and_return;
2094 sk_incoming_cpu_update(sk);
2096 bh_lock_sock_nested(sk);
2097 tcp_segs_in(tcp_sk(sk), skb);
2099 if (!sock_owned_by_user(sk)) {
2100 ret = tcp_v4_do_rcv(sk, skb);
2102 if (tcp_add_backlog(sk, skb, &drop_reason))
2103 goto discard_and_relse;
2114 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2115 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2118 tcp_v4_fill_cb(skb, iph, th);
2120 if (tcp_checksum_complete(skb)) {
2122 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2123 trace_tcp_bad_csum(skb);
2124 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2126 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2128 tcp_v4_send_reset(NULL, skb);
2132 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2133 /* Discard frame. */
2134 kfree_skb_reason(skb, drop_reason);
2138 sk_drops_add(sk, skb);
2144 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2145 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2146 inet_twsk_put(inet_twsk(sk));
2150 tcp_v4_fill_cb(skb, iph, th);
2152 if (tcp_checksum_complete(skb)) {
2153 inet_twsk_put(inet_twsk(sk));
2156 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2158 struct sock *sk2 = inet_lookup_listener(net,
2159 net->ipv4.tcp_death_row.hashinfo,
2160 skb, __tcp_hdrlen(th),
2161 iph->saddr, th->source,
2162 iph->daddr, th->dest,
2166 inet_twsk_deschedule_put(inet_twsk(sk));
2168 tcp_v4_restore_cb(skb);
2176 tcp_v4_timewait_ack(sk, skb);
2179 tcp_v4_send_reset(sk, skb);
2180 inet_twsk_deschedule_put(inet_twsk(sk));
2182 case TCP_TW_SUCCESS:;
2187 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2188 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2189 .twsk_unique = tcp_twsk_unique,
2190 .twsk_destructor= tcp_twsk_destructor,
2193 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2195 struct dst_entry *dst = skb_dst(skb);
2197 if (dst && dst_hold_safe(dst)) {
2198 rcu_assign_pointer(sk->sk_rx_dst, dst);
2199 sk->sk_rx_dst_ifindex = skb->skb_iif;
2202 EXPORT_SYMBOL(inet_sk_rx_dst_set);
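/* Connection-level callbacks for TCP over IPv4. tcp_v4_init_sock() below
 * installs this table as icsk->icsk_af_ops for every new IPv4 TCP socket.
 */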
2204 const struct inet_connection_sock_af_ops ipv4_specific = {
2205 .queue_xmit = ip_queue_xmit,
2206 .send_check = tcp_v4_send_check,
2207 .rebuild_header = inet_sk_rebuild_header,
2208 .sk_rx_dst_set = inet_sk_rx_dst_set,
2209 .conn_request = tcp_v4_conn_request,
2210 .syn_recv_sock = tcp_v4_syn_recv_sock,
2211 .net_header_len = sizeof(struct iphdr),
2212 .setsockopt = ip_setsockopt,
2213 .getsockopt = ip_getsockopt,
2214 .addr2sockaddr = inet_csk_addr2sockaddr,
2215 .sockaddr_len = sizeof(struct sockaddr_in),
2216 .mtu_reduced = tcp_v4_mtu_reduced,
2218 EXPORT_SYMBOL(ipv4_specific);
2220 #ifdef CONFIG_TCP_MD5SIG
2221 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2222 .md5_lookup = tcp_v4_md5_lookup,
2223 .calc_md5_hash = tcp_v4_md5_hash_skb,
2224 .md5_parse = tcp_v4_parse_md5_keys,
2228 /* NOTE: A lot of things set to zero explicitly by call to
2229 * sk_alloc() so need not be done here.
2231 static int tcp_v4_init_sock(struct sock *sk)
2233 struct inet_connection_sock *icsk = inet_csk(sk);
2237 icsk->icsk_af_ops = &ipv4_specific;
2239 #ifdef CONFIG_TCP_MD5SIG
2240 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2246 void tcp_v4_destroy_sock(struct sock *sk)
2248 struct tcp_sock *tp = tcp_sk(sk);
2250 trace_tcp_destroy_sock(sk);
2252 tcp_clear_xmit_timers(sk);
2254 tcp_cleanup_congestion_control(sk);
2256 tcp_cleanup_ulp(sk);
2258 /* Cleanup up the write buffer. */
2259 tcp_write_queue_purge(sk);
2261 /* Check if we want to disable active TFO */
2262 tcp_fastopen_active_disable_ofo_check(sk);
2264 /* Cleans up our, hopefully empty, out_of_order_queue. */
2265 skb_rbtree_purge(&tp->out_of_order_queue);
2267 #ifdef CONFIG_TCP_MD5SIG
2268 /* Clean up the MD5 key list, if any */
2269 if (tp->md5sig_info) {
2270 tcp_clear_md5_list(sk);
2271 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2272 tp->md5sig_info = NULL;
2276 /* Clean up a referenced TCP bind bucket. */
2277 if (inet_csk(sk)->icsk_bind_hash)
2280 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2282 /* If socket is aborted during connect operation */
2283 tcp_free_fastopen_req(tp);
2284 tcp_fastopen_destroy_cipher(sk);
2285 tcp_saved_syn_free(tp);
2287 sk_sockets_allocated_dec(sk);
2289 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2291 #ifdef CONFIG_PROC_FS
2292 /* Proc filesystem TCP sock list dumping. */
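/* For illustration: the iterators below back /proc/net/tcp, which prints one
 * line per socket, e.g.
 *
 *	$ cat /proc/net/tcp
 *	  sl  local_address rem_address   st tx_queue rx_queue ...
 *	   0: 0100007F:0016 00000000:0000 0A 00000000:00000000 ...
 *
 * Addresses and ports are hexadecimal (0100007F:0016 is 127.0.0.1:22 on a
 * little-endian machine) and "st" is the TCP state (0A == TCP_LISTEN).
 */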
2294 static unsigned short seq_file_family(const struct seq_file *seq);
2296 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2298 unsigned short family = seq_file_family(seq);
2300 /* AF_UNSPEC is used as a match all */
2301 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2302 net_eq(sock_net(sk), seq_file_net(seq)));
2305 /* Find a non empty bucket (starting from st->bucket)
2306 * and return the first sk from it.
2308 static void *listening_get_first(struct seq_file *seq)
2310 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2311 struct tcp_iter_state *st = seq->private;
2314 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2315 struct inet_listen_hashbucket *ilb2;
2316 struct hlist_nulls_node *node;
2319 ilb2 = &hinfo->lhash2[st->bucket];
2320 if (hlist_nulls_empty(&ilb2->nulls_head))
2323 spin_lock(&ilb2->lock);
2324 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2325 if (seq_sk_match(seq, sk))
2328 spin_unlock(&ilb2->lock);
/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
 * If "cur" is the last one in the st->bucket,
 * call listening_get_first() to return the first sk of the next
 * bucket.
 */
2339 static void *listening_get_next(struct seq_file *seq, void *cur)
2341 struct tcp_iter_state *st = seq->private;
2342 struct inet_listen_hashbucket *ilb2;
2343 struct hlist_nulls_node *node;
2344 struct inet_hashinfo *hinfo;
2345 struct sock *sk = cur;
2350 sk = sk_nulls_next(sk);
2351 sk_nulls_for_each_from(sk, node) {
2352 if (seq_sk_match(seq, sk))
2356 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2357 ilb2 = &hinfo->lhash2[st->bucket];
2358 spin_unlock(&ilb2->lock);
2360 return listening_get_first(seq);
2363 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2365 struct tcp_iter_state *st = seq->private;
2370 rc = listening_get_first(seq);
2372 while (rc && *pos) {
2373 rc = listening_get_next(seq, rc);
2379 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2380 const struct tcp_iter_state *st)
2382 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2386 * Get first established socket starting from bucket given in st->bucket.
2387 * If st->bucket is zero, the very first socket in the hash is returned.
2389 static void *established_get_first(struct seq_file *seq)
2391 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2392 struct tcp_iter_state *st = seq->private;
2395 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2397 struct hlist_nulls_node *node;
2398 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2400 /* Lockless fast path for the common case of empty buckets */
2401 if (empty_bucket(hinfo, st))
2405 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2406 if (seq_sk_match(seq, sk))
2409 spin_unlock_bh(lock);
2415 static void *established_get_next(struct seq_file *seq, void *cur)
2417 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2418 struct tcp_iter_state *st = seq->private;
2419 struct hlist_nulls_node *node;
2420 struct sock *sk = cur;
2425 sk = sk_nulls_next(sk);
2427 sk_nulls_for_each_from(sk, node) {
2428 if (seq_sk_match(seq, sk))
2432 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2434 return established_get_first(seq);
2437 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2439 struct tcp_iter_state *st = seq->private;
2443 rc = established_get_first(seq);
2446 rc = established_get_next(seq, rc);
2452 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2455 struct tcp_iter_state *st = seq->private;
2457 st->state = TCP_SEQ_STATE_LISTENING;
2458 rc = listening_get_idx(seq, &pos);
2461 st->state = TCP_SEQ_STATE_ESTABLISHED;
2462 rc = established_get_idx(seq, pos);
2468 static void *tcp_seek_last_pos(struct seq_file *seq)
2470 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2471 struct tcp_iter_state *st = seq->private;
2472 int bucket = st->bucket;
2473 int offset = st->offset;
2474 int orig_num = st->num;
2477 switch (st->state) {
2478 case TCP_SEQ_STATE_LISTENING:
2479 if (st->bucket > hinfo->lhash2_mask)
2481 st->state = TCP_SEQ_STATE_LISTENING;
2482 rc = listening_get_first(seq);
2483 while (offset-- && rc && bucket == st->bucket)
2484 rc = listening_get_next(seq, rc);
2488 st->state = TCP_SEQ_STATE_ESTABLISHED;
2490 case TCP_SEQ_STATE_ESTABLISHED:
2491 if (st->bucket > hinfo->ehash_mask)
2493 rc = established_get_first(seq);
2494 while (offset-- && rc && bucket == st->bucket)
2495 rc = established_get_next(seq, rc);
2503 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2505 struct tcp_iter_state *st = seq->private;
2508 if (*pos && *pos == st->last_pos) {
2509 rc = tcp_seek_last_pos(seq);
2514 st->state = TCP_SEQ_STATE_LISTENING;
2518 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2521 st->last_pos = *pos;
2524 EXPORT_SYMBOL(tcp_seq_start);
2526 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2528 struct tcp_iter_state *st = seq->private;
2531 if (v == SEQ_START_TOKEN) {
2532 rc = tcp_get_idx(seq, 0);
2536 switch (st->state) {
2537 case TCP_SEQ_STATE_LISTENING:
2538 rc = listening_get_next(seq, v);
2540 st->state = TCP_SEQ_STATE_ESTABLISHED;
2543 rc = established_get_first(seq);
2546 case TCP_SEQ_STATE_ESTABLISHED:
2547 rc = established_get_next(seq, v);
2552 st->last_pos = *pos;
2555 EXPORT_SYMBOL(tcp_seq_next);
2557 void tcp_seq_stop(struct seq_file *seq, void *v)
2559 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2560 struct tcp_iter_state *st = seq->private;
2562 switch (st->state) {
2563 case TCP_SEQ_STATE_LISTENING:
2564 if (v != SEQ_START_TOKEN)
2565 spin_unlock(&hinfo->lhash2[st->bucket].lock);
2567 case TCP_SEQ_STATE_ESTABLISHED:
2569 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2573 EXPORT_SYMBOL(tcp_seq_stop);
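/*
 * Illustration (simplified, not the real fs/seq_file.c loop): the three ops
 * above implement the seq_file iteration contract for /proc/net/tcp.  Roughly,
 * one read() is driven like this; the real core also restarts the cycle after
 * growing its buffer:
 *
 *	void *v = tcp_seq_start(seq, &pos);	// may return SEQ_START_TOKEN
 *
 *	while (v && !seq_has_overflowed(seq)) {
 *		tcp4_seq_show(seq, v);		// format one header line or socket
 *		v = tcp_seq_next(seq, v, &pos);	// walk within and across buckets
 *	}
 *	tcp_seq_stop(seq, v);			// drops any bucket lock still held
 */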
2575 static void get_openreq4(const struct request_sock *req,
2576 struct seq_file *f, int i)
2578 const struct inet_request_sock *ireq = inet_rsk(req);
2579 long delta = req->rsk_timer.expires - jiffies;
2581 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2582 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2587 ntohs(ireq->ir_rmt_port),
2589 0, 0, /* could print option size, but that is af dependent. */
2590 1, /* timers active (only the expire timer) */
2591 jiffies_delta_to_clock_t(delta),
2593 from_kuid_munged(seq_user_ns(f),
2594 sock_i_uid(req->rsk_listener)),
2595 0, /* non standard timer */
2596 0, /* open_requests have no inode */
2601 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2604 unsigned long timer_expires;
2605 const struct tcp_sock *tp = tcp_sk(sk);
2606 const struct inet_connection_sock *icsk = inet_csk(sk);
2607 const struct inet_sock *inet = inet_sk(sk);
2608 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2609 __be32 dest = inet->inet_daddr;
2610 __be32 src = inet->inet_rcv_saddr;
2611 __u16 destp = ntohs(inet->inet_dport);
2612 __u16 srcp = ntohs(inet->inet_sport);
2616 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2617 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2618 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2620 timer_expires = icsk->icsk_timeout;
2621 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2623 timer_expires = icsk->icsk_timeout;
2624 } else if (timer_pending(&sk->sk_timer)) {
2626 timer_expires = sk->sk_timer.expires;
2629 timer_expires = jiffies;
2632 state = inet_sk_state_load(sk);
2633 if (state == TCP_LISTEN)
2634 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2636 /* Because we don't lock the socket,
2637 * we might find a transient negative value.
2639 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2640 READ_ONCE(tp->copied_seq), 0);
2642 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2643 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2644 i, src, srcp, dest, destp, state,
2645 READ_ONCE(tp->write_seq) - tp->snd_una,
2648 jiffies_delta_to_clock_t(timer_expires - jiffies),
2649 icsk->icsk_retransmits,
2650 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2651 icsk->icsk_probes_out,
2653 refcount_read(&sk->sk_refcnt), sk,
2654 jiffies_to_clock_t(icsk->icsk_rto),
2655 jiffies_to_clock_t(icsk->icsk_ack.ato),
2656 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2658 state == TCP_LISTEN ?
2659 fastopenq->max_qlen :
2660 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2663 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2664 struct seq_file *f, int i)
2666 long delta = tw->tw_timer.expires - jiffies;
2670 dest = tw->tw_daddr;
2671 src = tw->tw_rcv_saddr;
2672 destp = ntohs(tw->tw_dport);
2673 srcp = ntohs(tw->tw_sport);
2675 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2676 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2677 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2678 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2679 refcount_read(&tw->tw_refcnt), tw);
2684 static int tcp4_seq_show(struct seq_file *seq, void *v)
2686 struct tcp_iter_state *st;
2687 struct sock *sk = v;
2689 seq_setwidth(seq, TMPSZ - 1);
2690 if (v == SEQ_START_TOKEN) {
2691 seq_puts(seq, " sl local_address rem_address st tx_queue "
2692 "rx_queue tr tm->when retrnsmt uid timeout "
"inode");
2698 if (sk->sk_state == TCP_TIME_WAIT)
2699 get_timewait4_sock(v, seq, st->num);
2700 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2701 get_openreq4(v, seq, st->num);
2703 get_tcp4_sock(v, seq, st->num);
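/*
 * Illustration (user space, not kernel code): a minimal parser for the lines
 * that get_tcp4_sock() above emits; only the local address, local port and
 * state columns are decoded, everything else is skipped:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		FILE *f = fopen("/proc/net/tcp", "r");
 *		char line[512];
 *		unsigned int addr, port, state;
 *
 *		if (!f)
 *			return 1;
 *		fgets(line, sizeof(line), f);	// skip the header row
 *		while (fgets(line, sizeof(line), f)) {
 *			if (sscanf(line, "%*d: %x:%x %*x:%*x %x",
 *				   &addr, &port, &state) == 3)
 *				printf("local %08X:%04X state %02X\n",
 *				       addr, port, state);
 *		}
 *		fclose(f);
 *		return 0;
 *	}
 */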
2709 #ifdef CONFIG_BPF_SYSCALL
2710 struct bpf_tcp_iter_state {
2711 struct tcp_iter_state state;
2712 unsigned int cur_sk;
2713 unsigned int end_sk;
2714 unsigned int max_sk;
2715 struct sock **batch;
2716 bool st_bucket_done;
};
2719 struct bpf_iter__tcp {
2720 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2721 __bpf_md_ptr(struct sock_common *, sk_common);
2722 uid_t uid __aligned(8);
};
2725 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2726 struct sock_common *sk_common, uid_t uid)
2728 struct bpf_iter__tcp ctx;
2730 meta->seq_num--; /* skip SEQ_START_TOKEN */
2732 ctx.sk_common = sk_common;
2734 return bpf_iter_run_prog(prog, &ctx);
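/*
 * Illustration (BPF side, not part of this file): a minimal iterator program
 * that receives the bpf_iter__tcp context built above.  A sketch, assuming the
 * usual vmlinux.h/libbpf headers; the program name is made up:
 *
 *	#include "vmlinux.h"
 *	#include <bpf/bpf_helpers.h>
 *	#include <bpf/bpf_tracing.h>
 *
 *	char _license[] SEC("license") = "GPL";
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *		struct seq_file *seq = ctx->meta->seq;
 *
 *		if (!skc)
 *			return 0;
 *		// One line per socket; tcp_prog_seq_show() passes uid in ctx.
 *		BPF_SEQ_PRINTF(seq, "family %d uid %u\n",
 *			       skc->skc_family, ctx->uid);
 *		return 0;
 *	}
 */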
2737 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2739 while (iter->cur_sk < iter->end_sk)
2740 sock_put(iter->batch[iter->cur_sk++]);
2743 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2744 unsigned int new_batch_sz)
2746 struct sock **new_batch;
2748 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2749 GFP_USER | __GFP_NOWARN);
2753 bpf_iter_tcp_put_batch(iter);
2754 kvfree(iter->batch);
2755 iter->batch = new_batch;
2756 iter->max_sk = new_batch_sz;
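/*
 * Note: on reallocation the old batch is not copied over.  The sockets it held
 * are released via bpf_iter_tcp_put_batch() above, the old array is freed, and
 * the caller re-walks the bucket into the larger array (see the one-shot
 * "resized" retry in bpf_iter_tcp_batch() below).
 */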
2761 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2762 struct sock *start_sk)
2764 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2765 struct bpf_tcp_iter_state *iter = seq->private;
2766 struct tcp_iter_state *st = &iter->state;
2767 struct hlist_nulls_node *node;
2768 unsigned int expected = 1;
2771 sock_hold(start_sk);
2772 iter->batch[iter->end_sk++] = start_sk;
2774 sk = sk_nulls_next(start_sk);
2775 sk_nulls_for_each_from(sk, node) {
2776 if (seq_sk_match(seq, sk)) {
2777 if (iter->end_sk < iter->max_sk) {
2779 iter->batch[iter->end_sk++] = sk;
2784 spin_unlock(&hinfo->lhash2[st->bucket].lock);
2789 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2790 struct sock *start_sk)
2792 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2793 struct bpf_tcp_iter_state *iter = seq->private;
2794 struct tcp_iter_state *st = &iter->state;
2795 struct hlist_nulls_node *node;
2796 unsigned int expected = 1;
2799 sock_hold(start_sk);
2800 iter->batch[iter->end_sk++] = start_sk;
2802 sk = sk_nulls_next(start_sk);
2803 sk_nulls_for_each_from(sk, node) {
2804 if (seq_sk_match(seq, sk)) {
2805 if (iter->end_sk < iter->max_sk) {
2807 iter->batch[iter->end_sk++] = sk;
2812 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2817 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2819 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2820 struct bpf_tcp_iter_state *iter = seq->private;
2821 struct tcp_iter_state *st = &iter->state;
2822 unsigned int expected;
2823 bool resized = false;
2826 /* The st->bucket is done. Directly advance to the next
2827 * bucket instead of having tcp_seek_last_pos() skip sockets
2828 * one by one in the current bucket, only to find out that
2829 * it has to advance to the next bucket anyway.
 */
2831 if (iter->st_bucket_done) {
2834 if (st->state == TCP_SEQ_STATE_LISTENING &&
2835 st->bucket > hinfo->lhash2_mask) {
2836 st->state = TCP_SEQ_STATE_ESTABLISHED;
2842 /* Get a new batch */
2845 iter->st_bucket_done = false;
2847 sk = tcp_seek_last_pos(seq);
2849 return NULL; /* Done */
2851 if (st->state == TCP_SEQ_STATE_LISTENING)
2852 expected = bpf_iter_tcp_listening_batch(seq, sk);
2854 expected = bpf_iter_tcp_established_batch(seq, sk);
2856 if (iter->end_sk == expected) {
2857 iter->st_bucket_done = true;
2861 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2869 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2871 /* bpf iter does not support lseek, so it always
2872 * continues from where it was stop()-ped.
 */
2875 return bpf_iter_tcp_batch(seq);
2877 return SEQ_START_TOKEN;
2880 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2882 struct bpf_tcp_iter_state *iter = seq->private;
2883 struct tcp_iter_state *st = &iter->state;
2886 /* Whenever seq_next() is called, the iter->cur_sk is
2887 * done with seq_show(), so advance to the next sk in
 * the batch.
 */
2890 if (iter->cur_sk < iter->end_sk) {
2891 /* Keeping st->num consistent in tcp_iter_state.
2892 * bpf_iter_tcp does not use st->num.
2893 * meta.seq_num is used instead.
2896 /* Move st->offset to the next sk in the bucket such that
2897 * the future start() will resume at st->offset in
2898 * st->bucket. See tcp_seek_last_pos().
2901 sock_put(iter->batch[iter->cur_sk++]);
2904 if (iter->cur_sk < iter->end_sk)
2905 sk = iter->batch[iter->cur_sk];
2907 sk = bpf_iter_tcp_batch(seq);
2910 /* Keeping st->last_pos consistent in tcp_iter_state.
2911 * bpf iter does not do lseek, so st->last_pos always equals *pos.
 */
2913 st->last_pos = *pos;
2917 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2919 struct bpf_iter_meta meta;
2920 struct bpf_prog *prog;
2921 struct sock *sk = v;
2926 if (v == SEQ_START_TOKEN)
2929 if (sk_fullsock(sk))
2930 slow = lock_sock_fast(sk);
2932 if (unlikely(sk_unhashed(sk))) {
2937 if (sk->sk_state == TCP_TIME_WAIT) {
2939 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2940 const struct request_sock *req = v;
2942 uid = from_kuid_munged(seq_user_ns(seq),
2943 sock_i_uid(req->rsk_listener));
2945 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2949 prog = bpf_iter_get_info(&meta, false);
2950 ret = tcp_prog_seq_show(prog, &meta, v, uid);
2953 if (sk_fullsock(sk))
2954 unlock_sock_fast(sk, slow);
2959 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2961 struct bpf_tcp_iter_state *iter = seq->private;
2962 struct bpf_iter_meta meta;
2963 struct bpf_prog *prog;
2967 prog = bpf_iter_get_info(&meta, true);
2969 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2972 if (iter->cur_sk < iter->end_sk) {
2973 bpf_iter_tcp_put_batch(iter);
2974 iter->st_bucket_done = false;
2978 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2979 .show = bpf_iter_tcp_seq_show,
2980 .start = bpf_iter_tcp_seq_start,
2981 .next = bpf_iter_tcp_seq_next,
2982 .stop = bpf_iter_tcp_seq_stop,
};
2985 static unsigned short seq_file_family(const struct seq_file *seq)
2987 const struct tcp_seq_afinfo *afinfo;
2989 #ifdef CONFIG_BPF_SYSCALL
2990 /* Iterated from bpf_iter. Let the bpf prog filter instead. */
2991 if (seq->op == &bpf_iter_tcp_seq_ops)
2995 /* Iterated from proc fs */
2996 afinfo = pde_data(file_inode(seq->file));
2997 return afinfo->family;
3000 static const struct seq_operations tcp4_seq_ops = {
3001 .show = tcp4_seq_show,
3002 .start = tcp_seq_start,
3003 .next = tcp_seq_next,
3004 .stop = tcp_seq_stop,
};

3007 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
.family = AF_INET,
};
3011 static int __net_init tcp4_proc_init_net(struct net *net)
3013 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3014 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3019 static void __net_exit tcp4_proc_exit_net(struct net *net)
3021 remove_proc_entry("tcp", net->proc_net);
3024 static struct pernet_operations tcp4_net_ops = {
3025 .init = tcp4_proc_init_net,
3026 .exit = tcp4_proc_exit_net,
};
3029 int __init tcp4_proc_init(void)
3031 return register_pernet_subsys(&tcp4_net_ops);
3034 void tcp4_proc_exit(void)
3036 unregister_pernet_subsys(&tcp4_net_ops);
3038 #endif /* CONFIG_PROC_FS */
3040 /* @wake is one when sk_stream_write_space() calls us.
3041 * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3042 * This mimics the strategy used in sock_def_write_space().
 */
3044 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3046 const struct tcp_sock *tp = tcp_sk(sk);
3047 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3048 READ_ONCE(tp->snd_nxt);
3050 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3052 EXPORT_SYMBOL(tcp_stream_memory_free);
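/*
 * Illustration (user space, not kernel code): the limit checked above is the
 * per-socket TCP_NOTSENT_LOWAT value (net.ipv4.tcp_notsent_lowat by default).
 * A minimal sketch; the 128 KiB figure is arbitrary:
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <sys/socket.h>
 *
 *	static int limit_unsent(int fd)
 *	{
 *		int lowat = 128 * 1024;	// cap not-yet-sent data at 128 KiB
 *
 *		// With this set, poll()/epoll report writability only once the
 *		// unsent backlog drops under the limit (under half of it when
 *		// woken from sk_stream_write_space(), per the check above).
 *		return setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
 *				  &lowat, sizeof(lowat));
 *	}
 */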
3054 struct proto tcp_prot = {
3056 .owner = THIS_MODULE,
3058 .pre_connect = tcp_v4_pre_connect,
3059 .connect = tcp_v4_connect,
3060 .disconnect = tcp_disconnect,
3061 .accept = inet_csk_accept,
3063 .init = tcp_v4_init_sock,
3064 .destroy = tcp_v4_destroy_sock,
3065 .shutdown = tcp_shutdown,
3066 .setsockopt = tcp_setsockopt,
3067 .getsockopt = tcp_getsockopt,
3068 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3069 .keepalive = tcp_set_keepalive,
3070 .recvmsg = tcp_recvmsg,
3071 .sendmsg = tcp_sendmsg,
3072 .sendpage = tcp_sendpage,
3073 .backlog_rcv = tcp_v4_do_rcv,
3074 .release_cb = tcp_release_cb,
3076 .unhash = inet_unhash,
3077 .get_port = inet_csk_get_port,
3078 .put_port = inet_put_port,
3079 #ifdef CONFIG_BPF_SYSCALL
3080 .psock_update_sk_prot = tcp_bpf_update_proto,
3082 .enter_memory_pressure = tcp_enter_memory_pressure,
3083 .leave_memory_pressure = tcp_leave_memory_pressure,
3084 .stream_memory_free = tcp_stream_memory_free,
3085 .sockets_allocated = &tcp_sockets_allocated,
3086 .orphan_count = &tcp_orphan_count,
3088 .memory_allocated = &tcp_memory_allocated,
3089 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
3091 .memory_pressure = &tcp_memory_pressure,
3092 .sysctl_mem = sysctl_tcp_mem,
3093 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3094 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3095 .max_header = MAX_TCP_HEADER,
3096 .obj_size = sizeof(struct tcp_sock),
3097 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3098 .twsk_prot = &tcp_timewait_sock_ops,
3099 .rsk_prot = &tcp_request_sock_ops,
3101 .no_autobind = true,
3102 .diag_destroy = tcp_abort,
};
3104 EXPORT_SYMBOL(tcp_prot);
3106 static void __net_exit tcp_sk_exit(struct net *net)
3108 if (net->ipv4.tcp_congestion_control)
3109 bpf_module_put(net->ipv4.tcp_congestion_control,
3110 net->ipv4.tcp_congestion_control->owner);
3113 static void __net_init tcp_set_hashinfo(struct net *net)
3115 struct inet_hashinfo *hinfo;
3116 unsigned int ehash_entries;
3117 struct net *old_net;
3119 if (net_eq(net, &init_net))
3122 old_net = current->nsproxy->net_ns;
3123 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3127 ehash_entries = roundup_pow_of_two(ehash_entries);
3128 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3130 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3131 "for a netns, fallback to the global one\n",
3134 hinfo = &tcp_hashinfo;
3135 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3138 net->ipv4.tcp_death_row.hashinfo = hinfo;
3139 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3140 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
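/*
 * Worked example of the sizing above (numbers illustrative): with the parent
 * netns setting net.ipv4.tcp_child_ehash_entries = 1000, roundup_pow_of_two()
 * yields 1024 ehash slots, so the child netns gets
 * sysctl_max_tw_buckets = 1024 / 2 = 512 and
 * sysctl_max_syn_backlog = max(128, 1024 / 128) = 128.
 */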
3143 static int __net_init tcp_sk_init(struct net *net)
3145 net->ipv4.sysctl_tcp_ecn = 2;
3146 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3148 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3149 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3150 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3151 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3152 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3154 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3155 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3156 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3158 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3159 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3160 net->ipv4.sysctl_tcp_syncookies = 1;
3161 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3162 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3163 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3164 net->ipv4.sysctl_tcp_orphan_retries = 0;
3165 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3166 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3167 net->ipv4.sysctl_tcp_tw_reuse = 2;
3168 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3170 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3171 tcp_set_hashinfo(net);
3173 net->ipv4.sysctl_tcp_sack = 1;
3174 net->ipv4.sysctl_tcp_window_scaling = 1;
3175 net->ipv4.sysctl_tcp_timestamps = 1;
3176 net->ipv4.sysctl_tcp_early_retrans = 3;
3177 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3178 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3179 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3180 net->ipv4.sysctl_tcp_max_reordering = 300;
3181 net->ipv4.sysctl_tcp_dsack = 1;
3182 net->ipv4.sysctl_tcp_app_win = 31;
3183 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3184 net->ipv4.sysctl_tcp_frto = 2;
3185 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3186 /* This limits the percentage of the congestion window which we
3187 * will allow a single TSO frame to consume. Building TSO frames
3188 * which are too large can cause TCP streams to be bursty.
3190 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3191 /* Default TSQ limit of 16 TSO segments */
3192 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3194 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3195 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3197 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3198 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
3199 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3200 net->ipv4.sysctl_tcp_autocorking = 1;
3201 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3202 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3203 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3204 if (net != &init_net) {
3205 memcpy(net->ipv4.sysctl_tcp_rmem,
3206 init_net.ipv4.sysctl_tcp_rmem,
3207 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3208 memcpy(net->ipv4.sysctl_tcp_wmem,
3209 init_net.ipv4.sysctl_tcp_wmem,
3210 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3212 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3213 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3214 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3215 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3216 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3217 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3219 /* Reno is always built in */
3220 if (!net_eq(net, &init_net) &&
3221 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3222 init_net.ipv4.tcp_congestion_control->owner))
3223 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3225 net->ipv4.tcp_congestion_control = &tcp_reno;
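/*
 * Illustration (user space, not kernel code): the per-netns default chosen
 * above can still be overridden per socket with TCP_CONGESTION.  A minimal
 * sketch; "reno" is used here because it is always built in:
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static int use_reno(int fd)
 *	{
 *		static const char name[] = "reno";
 *
 *		return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
 *				  name, strlen(name));
 *	}
 */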
3230 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3234 tcp_twsk_purge(net_exit_list, AF_INET);
3236 list_for_each_entry(net, net_exit_list, exit_list) {
3237 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3238 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3239 tcp_fastopen_ctx_destroy(net);
3243 static struct pernet_operations __net_initdata tcp_sk_ops = {
3244 .init = tcp_sk_init,
3245 .exit = tcp_sk_exit,
3246 .exit_batch = tcp_sk_exit_batch,
};
3249 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3250 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3251 struct sock_common *sk_common, uid_t uid)
3253 #define INIT_BATCH_SZ 16
3255 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3257 struct bpf_tcp_iter_state *iter = priv_data;
3260 err = bpf_iter_init_seq_net(priv_data, aux);
3264 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3266 bpf_iter_fini_seq_net(priv_data);
3273 static void bpf_iter_fini_tcp(void *priv_data)
3275 struct bpf_tcp_iter_state *iter = priv_data;
3277 bpf_iter_fini_seq_net(priv_data);
3278 kvfree(iter->batch);
3281 static const struct bpf_iter_seq_info tcp_seq_info = {
3282 .seq_ops = &bpf_iter_tcp_seq_ops,
3283 .init_seq_private = bpf_iter_init_tcp,
3284 .fini_seq_private = bpf_iter_fini_tcp,
3285 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
};
3288 static const struct bpf_func_proto *
3289 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3290 const struct bpf_prog *prog)
3293 case BPF_FUNC_setsockopt:
3294 return &bpf_sk_setsockopt_proto;
3295 case BPF_FUNC_getsockopt:
3296 return &bpf_sk_getsockopt_proto;
3302 static struct bpf_iter_reg tcp_reg_info = {
3304 .ctx_arg_info_size = 1,
3306 { offsetof(struct bpf_iter__tcp, sk_common),
3307 PTR_TO_BTF_ID_OR_NULL },
3309 .get_func_proto = bpf_iter_tcp_get_func_proto,
3310 .seq_info = &tcp_seq_info,
};
3313 static void __init bpf_iter_register(void)
3315 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3316 if (bpf_iter_reg_target(&tcp_reg_info))
3317 pr_warn("Warning: could not register bpf iterator tcp\n");
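/*
 * Illustration (user space, fragment, not kernel code): once the "tcp" target
 * is registered here, a program such as the dump_tcp sketch earlier can be
 * attached and read like a file.  Assumes a libbpf skeleton named "skel";
 * error handling is omitted:
 *
 *	struct bpf_link *link = bpf_program__attach_iter(skel->progs.dump_tcp, NULL);
 *	int iter_fd = bpf_iter_create(bpf_link__fd(link));
 *	char buf[4096];
 *	ssize_t n;
 *
 *	while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
 *		fwrite(buf, 1, n, stdout);
 *	close(iter_fd);
 *	bpf_link__destroy(link);
 */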
3322 void __init tcp_v4_init(void)
3326 for_each_possible_cpu(cpu) {
3329 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3330 IPPROTO_TCP, &init_net);
3332 panic("Failed to create the TCP control socket.\n");
3333 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3335 /* Please enforce IP_DF and IPID==0 for RST and
3336 * ACK sent in SYN-RECV and TIME-WAIT state.
3338 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3340 per_cpu(ipv4_tcp_sk, cpu) = sk;
3342 if (register_pernet_subsys(&tcp_sk_ops))
3343 panic("Failed to create the TCP control socket.\n");
3345 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3346 bpf_iter_register();