1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * Implementation of the Transmission Control Protocol (TCP).
9 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
48 #define pr_fmt(fmt) "TCP: " fmt
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
61 #include <net/net_namespace.h>
63 #include <net/inet_hashtables.h>
65 #include <net/transp_v6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
84 #include <trace/events/tcp.h>
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
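/* tcp_v4_init_seq() derives the initial sequence number from a keyed hash
 * of the connection 4-tuple plus a clock component (RFC 6528 style), so
 * ISNs are not predictable by off-path attackers; tcp_v4_init_ts_off()
 * similarly derives a timestamp offset per address pair.
 */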
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
96 return secure_tcp_seq(ip_hdr(skb)->daddr,
99 tcp_hdr(skb)->source);
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
104 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
109 const struct inet_timewait_sock *tw = inet_twsk(sktw);
110 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 struct tcp_sock *tp = tcp_sk(sk);
112 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
115 /* Still does not detect *everything* that goes through
116 * lo, since we require a loopback src or dst address
117 * or direct binding to 'lo' interface.
119 bool loopback = false;
120 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
122 #if IS_ENABLED(CONFIG_IPV6)
123 if (tw->tw_family == AF_INET6) {
124 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
132 if (ipv4_is_loopback(tw->tw_daddr) ||
133 ipv4_is_loopback(tw->tw_rcv_saddr))
140 /* With PAWS, it is safe from the viewpoint
141 of data integrity. Even without PAWS it is safe provided sequence
142 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
144 Actually, the idea is close to VJ's one, only the timestamp cache is
145 held not per host, but per port pair, and the TW bucket is used as state
146 holder.
148 If the TW bucket has already been destroyed we fall back to VJ's scheme
149 and use the initial timestamp retrieved from the peer table.
151 if (tcptw->tw_ts_recent_stamp &&
152 (!twp || (reuse && time_after32(ktime_get_seconds(),
153 tcptw->tw_ts_recent_stamp)))) {
154 /* In case of repair and re-using TIME-WAIT sockets we still
155 * want to be sure that it is safe as above but honor the
156 * sequence numbers and time stamps set as part of the repair
157 * process.
159 * Without this check re-using a TIME-WAIT socket with TCP
160 * repair would accumulate a -1 on the repair assigned
161 * sequence number. The first time it is reused the sequence
162 * is -1, the second time -2, etc. This fixes that issue
163 * without appearing to create any others.
165 if (likely(!tp->repair)) {
166 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
170 WRITE_ONCE(tp->write_seq, seq);
171 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
172 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
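/* Whether the reuse above is attempted at all is gated by the
 * net.ipv4.tcp_tw_reuse sysctl (the 'reuse' variable). A sketch of how an
 * administrator might enable it (illustrative, not part of this file):
 *
 *	# 0: never reuse, 1: global reuse, 2: loopback-only (default)
 *	sysctl -w net.ipv4.tcp_tw_reuse=1
 */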
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185 /* This check is replicated from tcp_v4_connect() and intended to
186 * prevent the BPF program called below from accessing bytes that are out
187 * of the bounds specified by the user in addr_len.
189 if (addr_len < sizeof(struct sockaddr_in))
192 sock_owned_by_me(sk);
194 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
200 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201 struct inet_sock *inet = inet_sk(sk);
202 struct tcp_sock *tp = tcp_sk(sk);
203 __be16 orig_sport, orig_dport;
204 __be32 daddr, nexthop;
208 struct ip_options_rcu *inet_opt;
209 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
211 if (addr_len < sizeof(struct sockaddr_in))
214 if (usin->sin_family != AF_INET)
215 return -EAFNOSUPPORT;
217 nexthop = daddr = usin->sin_addr.s_addr;
218 inet_opt = rcu_dereference_protected(inet->inet_opt,
219 lockdep_sock_is_held(sk));
220 if (inet_opt && inet_opt->opt.srr) {
223 nexthop = inet_opt->opt.faddr;
226 orig_sport = inet->inet_sport;
227 orig_dport = usin->sin_port;
228 fl4 = &inet->cork.fl.u.ip4;
229 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
232 orig_sport, orig_dport, sk);
235 if (err == -ENETUNREACH)
236 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
240 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
245 if (!inet_opt || !inet_opt->opt.srr)
248 if (!inet->inet_saddr)
249 inet->inet_saddr = fl4->saddr;
250 sk_rcv_saddr_set(sk, inet->inet_saddr);
252 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253 /* Reset inherited state */
254 tp->rx_opt.ts_recent = 0;
255 tp->rx_opt.ts_recent_stamp = 0;
256 if (likely(!tp->repair))
257 WRITE_ONCE(tp->write_seq, 0);
260 inet->inet_dport = usin->sin_port;
261 sk_daddr_set(sk, daddr);
263 inet_csk(sk)->icsk_ext_hdr_len = 0;
265 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
267 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
269 /* Socket identity is still unknown (sport may be zero).
270 * However, we set the state to SYN-SENT and, without releasing the socket
271 * lock, select a source port, enter ourselves into the hash tables and
272 * complete initialization after this.
274 tcp_set_state(sk, TCP_SYN_SENT);
275 err = inet_hash_connect(tcp_death_row, sk);
281 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282 inet->inet_sport, inet->inet_dport, sk);
288 /* OK, now commit destination to socket. */
289 sk->sk_gso_type = SKB_GSO_TCPV4;
290 sk_setup_caps(sk, &rt->dst);
293 if (likely(!tp->repair)) {
295 WRITE_ONCE(tp->write_seq,
296 secure_tcp_seq(inet->inet_saddr,
300 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
305 inet->inet_id = prandom_u32();
307 if (tcp_fastopen_defer_connect(sk, &err))
312 err = tcp_connect(sk);
321 * This unhashes the socket and releases the local port,
322 * if necessary.
324 tcp_set_state(sk, TCP_CLOSE);
326 sk->sk_route_caps = 0;
327 inet->inet_dport = 0;
330 EXPORT_SYMBOL(tcp_v4_connect);
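/* A minimal userspace sketch that exercises tcp_v4_connect() (reached via
 * inet_stream_connect() from the connect(2) syscall); illustrative only,
 * error handling omitted, TEST-NET address used:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in sa = {
 *		.sin_family = AF_INET,
 *		.sin_port = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &sa.sin_addr);
 *	connect(fd, (struct sockaddr *)&sa, sizeof(sa));
 */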
333 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334 * It can be called through tcp_release_cb() if socket was owned by user
335 * at the time tcp_v4_err() was called to handle ICMP message.
337 void tcp_v4_mtu_reduced(struct sock *sk)
339 struct inet_sock *inet = inet_sk(sk);
340 struct dst_entry *dst;
343 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
345 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
346 dst = inet_csk_update_pmtu(sk, mtu);
350 /* Something is about to go wrong... Remember the soft error
351 * for the case that this connection will not be able to recover.
353 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354 sk->sk_err_soft = EMSGSIZE;
358 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359 ip_sk_accept_pmtu(sk) &&
360 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361 tcp_sync_mss(sk, mtu);
363 /* Resend the TCP packet because it's
364 * clear that the old packet has been
365 * dropped. This is the new "fast" path mtu
366 * discovery.
368 tcp_simple_retransmit(sk);
369 } /* else let the usual retransmit timer handle it */
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
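/* Handle an ICMP redirect: revalidate the cached route and, if it is
 * still live, let the dst's ->redirect() handler update the next hop.
 */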
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
375 struct dst_entry *dst = __sk_dst_check(sk, 0);
378 dst->ops->redirect(dst, sk, skb);
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
385 struct request_sock *req = inet_reqsk(sk);
386 struct net *net = sock_net(sk);
388 /* ICMPs are not backlogged, hence we cannot get
389 * an established socket here.
391 if (seq != tcp_rsk(req)->snt_isn) {
392 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
395 * Still in SYN_RECV, just remove it silently.
396 * There is no good way to pass the error to the newly
397 * created socket, and POSIX does not want network
398 * errors returned from accept().
400 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401 tcp_listendrop(req->rsk_listener);
405 EXPORT_SYMBOL(tcp_req_err);
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
410 struct inet_connection_sock *icsk = inet_csk(sk);
411 struct tcp_sock *tp = tcp_sk(sk);
416 if (sock_owned_by_user(sk))
419 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
423 skb = tcp_rtx_queue_head(sk);
424 if (WARN_ON_ONCE(!skb))
427 icsk->icsk_backoff--;
428 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
431 tcp_mstamp_refresh(tp);
432 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
436 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437 remaining, TCP_RTO_MAX);
439 /* RTO revert clocked out retransmission.
440 * Will retransmit now.
442 tcp_retransmit_timer(sk);
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
448 * This routine is called by the ICMP module when it gets some
449 * sort of error condition. If err < 0 then the socket should
450 * be closed and the error returned to the user. If err > 0
451 * it's just the icmp type << 8 | icmp code. After adjustment
452 * header points to the first 8 bytes of the tcp header. We need
453 * to find the appropriate port.
455 * The locking strategy used here is very "optimistic". When
456 * someone else accesses the socket the ICMP is just dropped
457 * and for some paths there is no check at all.
458 * A more general error queue to queue errors for later handling
459 * is probably better.
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
465 const struct iphdr *iph = (const struct iphdr *)skb->data;
466 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
468 struct inet_sock *inet;
469 const int type = icmp_hdr(skb)->type;
470 const int code = icmp_hdr(skb)->code;
472 struct request_sock *fastopen;
475 struct net *net = dev_net(skb->dev);
477 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478 th->dest, iph->saddr, ntohs(th->source),
481 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
484 if (sk->sk_state == TCP_TIME_WAIT) {
485 inet_twsk_put(inet_twsk(sk));
488 seq = ntohl(th->seq);
489 if (sk->sk_state == TCP_NEW_SYN_RECV) {
490 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491 type == ICMP_TIME_EXCEEDED ||
492 (type == ICMP_DEST_UNREACH &&
493 (code == ICMP_NET_UNREACH ||
494 code == ICMP_HOST_UNREACH)));
499 /* If too many ICMPs get dropped on busy
500 * servers this needs to be solved differently.
501 * We do take care of PMTU discovery (RFC1191) special case:
502 * we can receive locally generated ICMP messages while socket is held.
504 if (sock_owned_by_user(sk)) {
505 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
508 if (sk->sk_state == TCP_CLOSE)
511 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
512 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
517 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
518 fastopen = rcu_dereference(tp->fastopen_rsk);
519 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
520 if (sk->sk_state != TCP_LISTEN &&
521 !between(seq, snd_una, tp->snd_nxt)) {
522 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
528 if (!sock_owned_by_user(sk))
529 do_redirect(skb, sk);
531 case ICMP_SOURCE_QUENCH:
532 /* Just silently ignore these. */
534 case ICMP_PARAMETERPROB:
537 case ICMP_DEST_UNREACH:
538 if (code > NR_ICMP_UNREACH)
541 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
542 /* We are not interested in TCP_LISTEN and open_requests
543 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
544 * they should go through unfragmented).
546 if (sk->sk_state == TCP_LISTEN)
549 WRITE_ONCE(tp->mtu_info, info);
550 if (!sock_owned_by_user(sk)) {
551 tcp_v4_mtu_reduced(sk);
553 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
559 err = icmp_err_convert[code].errno;
560 /* check if this ICMP message allows revert of backoff.
561 * (see RFC 6069)
564 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565 tcp_ld_RTO_revert(sk, seq);
567 case ICMP_TIME_EXCEEDED:
574 switch (sk->sk_state) {
577 /* Only in fast or simultaneous open. If a fast open socket is
578 * already accepted it is treated as a connected one below.
580 if (fastopen && !fastopen->sk)
583 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
585 if (!sock_owned_by_user(sk)) {
592 sk->sk_err_soft = err;
597 /* If we've already connected we will keep trying
598 * until we time out, or the user gives up.
600 * rfc1122 4.2.3.9 allows us to consider as hard errors
601 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
602 * but it is obsoleted by pmtu discovery).
604 * Note that on the modern internet, where routing is unreliable
605 * and broken firewalls sit in every dark corner sending random
606 * errors ordered by their masters, even these two messages have finally
607 * lost their original sense (even Linux sends invalid PORT_UNREACHs).
609 * Now we are in compliance with RFCs.
614 if (!sock_owned_by_user(sk) && inet->recverr) {
617 } else { /* Only an error on timeout */
618 sk->sk_err_soft = err;
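/* The helpers below prepare transmit checksums. Only the pseudo-header
 * sum is computed in software; skb->csum_start/csum_offset tell the device
 * (or skb_checksum_help() as a fallback) where to fold in the final sum,
 * i.e. the CHECKSUM_PARTIAL offload convention.
 */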
627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
629 struct tcphdr *th = tcp_hdr(skb);
631 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632 skb->csum_start = skb_transport_header(skb) - skb->head;
633 skb->csum_offset = offsetof(struct tcphdr, check);
636 /* This routine computes an IPv4 TCP checksum. */
637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
639 const struct inet_sock *inet = inet_sk(sk);
641 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
643 EXPORT_SYMBOL(tcp_v4_send_check);
646 * This routine will send an RST to the other tcp.
648 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)?
650 * Answer: if a packet caused the RST, it is not for a socket
651 * existing in our system; if it is matched to a socket,
652 * it is just a duplicate segment or a bug in the other side's TCP.
653 * So we build the reply based only on the parameters
654 * that arrived with the segment.
655 * Exception: precedence violation. We do not implement it in any case.
658 #ifdef CONFIG_TCP_MD5SIG
659 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
661 #define OPTION_BYTES sizeof(__be32)
664 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
666 const struct tcphdr *th = tcp_hdr(skb);
669 __be32 opt[OPTION_BYTES / sizeof(__be32)];
671 struct ip_reply_arg arg;
672 #ifdef CONFIG_TCP_MD5SIG
673 struct tcp_md5sig_key *key = NULL;
674 const __u8 *hash_location = NULL;
675 unsigned char newhash[16];
677 struct sock *sk1 = NULL;
679 u64 transmit_time = 0;
683 /* Never send a reset in response to a reset. */
687 /* If sk is not NULL, it means we did a successful lookup and the incoming
688 * route had to be correct. prequeue might have dropped our dst.
690 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
693 /* Swap the send and the receive. */
694 memset(&rep, 0, sizeof(rep));
695 rep.th.dest = th->source;
696 rep.th.source = th->dest;
697 rep.th.doff = sizeof(struct tcphdr) / 4;
701 rep.th.seq = th->ack_seq;
704 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
705 skb->len - (th->doff << 2));
708 memset(&arg, 0, sizeof(arg));
709 arg.iov[0].iov_base = (unsigned char *)&rep;
710 arg.iov[0].iov_len = sizeof(rep.th);
712 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
713 #ifdef CONFIG_TCP_MD5SIG
715 hash_location = tcp_parse_md5sig_option(th);
716 if (sk && sk_fullsock(sk)) {
717 const union tcp_md5_addr *addr;
720 /* sdif set means the packet ingressed via a device
721 * in an L3 domain and inet_iif is set to it.
723 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
724 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
725 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
726 } else if (hash_location) {
727 const union tcp_md5_addr *addr;
728 int sdif = tcp_v4_sdif(skb);
729 int dif = inet_iif(skb);
733 * active side is lost. Try to find the listening socket through the
734 * source port, and then find the md5 key through the listening socket.
735 * We are not losing security here:
736 * the incoming packet is checked with the md5 hash using the found key;
737 * no RST is generated if the md5 hash doesn't match.
739 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
741 th->source, ip_hdr(skb)->daddr,
742 ntohs(th->source), dif, sdif);
743 /* don't send an RST if we can't find a key */
747 /* sdif set means the packet ingressed via a device
748 * in an L3 domain and dif is set to it.
750 l3index = sdif ? dif : 0;
751 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
752 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
757 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
758 if (genhash || memcmp(hash_location, newhash, 16) != 0)
764 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
766 (TCPOPT_MD5SIG << 8) |
768 /* Update length and the length the header thinks exists */
769 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
770 rep.th.doff = arg.iov[0].iov_len / 4;
772 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
773 key, ip_hdr(skb)->saddr,
774 ip_hdr(skb)->daddr, &rep.th);
777 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
778 if (rep.opt[0] == 0) {
779 __be32 mrst = mptcp_reset_option(skb);
783 arg.iov[0].iov_len += sizeof(mrst);
784 rep.th.doff = arg.iov[0].iov_len / 4;
788 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
789 ip_hdr(skb)->saddr, /* XXX */
790 arg.iov[0].iov_len, IPPROTO_TCP, 0);
791 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
792 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
794 /* When the socket is gone, all binding information is lost;
795 * routing might fail in this case. No choice here: if we choose to force
796 * the input interface, we will misroute in case of an asymmetric route.
799 arg.bound_dev_if = sk->sk_bound_dev_if;
801 trace_tcp_send_reset(sk, skb);
804 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
805 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
807 arg.tos = ip_hdr(skb)->tos;
808 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
810 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
812 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
813 inet_twsk(sk)->tw_mark : sk->sk_mark;
814 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
815 inet_twsk(sk)->tw_priority : sk->sk_priority;
816 transmit_time = tcp_transmit_time(sk);
818 ip_send_unicast_reply(ctl_sk,
819 skb, &TCP_SKB_CB(skb)->header.h4.opt,
820 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
821 &arg, arg.iov[0].iov_len,
825 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
826 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
829 #ifdef CONFIG_TCP_MD5SIG
835 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
836 outside of socket context, is certainly ugly. What can I do?
839 static void tcp_v4_send_ack(const struct sock *sk,
840 struct sk_buff *skb, u32 seq, u32 ack,
841 u32 win, u32 tsval, u32 tsecr, int oif,
842 struct tcp_md5sig_key *key,
843 int reply_flags, u8 tos)
845 const struct tcphdr *th = tcp_hdr(skb);
848 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
849 #ifdef CONFIG_TCP_MD5SIG
850 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
854 struct net *net = sock_net(sk);
855 struct ip_reply_arg arg;
859 memset(&rep.th, 0, sizeof(struct tcphdr));
860 memset(&arg, 0, sizeof(arg));
862 arg.iov[0].iov_base = (unsigned char *)&rep;
863 arg.iov[0].iov_len = sizeof(rep.th);
865 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
866 (TCPOPT_TIMESTAMP << 8) |
868 rep.opt[1] = htonl(tsval);
869 rep.opt[2] = htonl(tsecr);
870 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
873 /* Swap the send and the receive. */
874 rep.th.dest = th->source;
875 rep.th.source = th->dest;
876 rep.th.doff = arg.iov[0].iov_len / 4;
877 rep.th.seq = htonl(seq);
878 rep.th.ack_seq = htonl(ack);
880 rep.th.window = htons(win);
882 #ifdef CONFIG_TCP_MD5SIG
884 int offset = (tsecr) ? 3 : 0;
886 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
888 (TCPOPT_MD5SIG << 8) |
890 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
891 rep.th.doff = arg.iov[0].iov_len/4;
893 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
894 key, ip_hdr(skb)->saddr,
895 ip_hdr(skb)->daddr, &rep.th);
898 arg.flags = reply_flags;
899 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
900 ip_hdr(skb)->saddr, /* XXX */
901 arg.iov[0].iov_len, IPPROTO_TCP, 0);
902 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
904 arg.bound_dev_if = oif;
906 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
908 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
909 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
910 inet_twsk(sk)->tw_mark : sk->sk_mark;
911 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
912 inet_twsk(sk)->tw_priority : sk->sk_priority;
913 transmit_time = tcp_transmit_time(sk);
914 ip_send_unicast_reply(ctl_sk,
915 skb, &TCP_SKB_CB(skb)->header.h4.opt,
916 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
917 &arg, arg.iov[0].iov_len,
921 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
925 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
927 struct inet_timewait_sock *tw = inet_twsk(sk);
928 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
930 tcp_v4_send_ack(sk, skb,
931 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
932 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
933 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
936 tcp_twsk_md5_key(tcptw),
937 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
944 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
945 struct request_sock *req)
947 const union tcp_md5_addr *addr;
950 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
951 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
953 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
957 * The window field (SEG.WND) of every outgoing segment, with the
958 * exception of <SYN> segments, MUST be right-shifted by
959 * Rcv.Wind.Shift bits:
961 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
962 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
963 tcp_v4_send_ack(sk, skb, seq,
964 tcp_rsk(req)->rcv_nxt,
965 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
966 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
969 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
970 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
975 * Send a SYN-ACK after having received a SYN.
976 * This still operates on a request_sock only, not on a big
977 * socket.
979 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
981 struct request_sock *req,
982 struct tcp_fastopen_cookie *foc,
983 enum tcp_synack_type synack_type,
984 struct sk_buff *syn_skb)
986 const struct inet_request_sock *ireq = inet_rsk(req);
992 /* First, grab a route. */
993 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
996 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
999 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1001 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1002 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1003 (inet_sk(sk)->tos & INET_ECN_MASK) :
1006 if (!INET_ECN_is_capable(tos) &&
1007 tcp_bpf_ca_needs_ecn((struct sock *)req))
1008 tos |= INET_ECN_ECT_0;
1011 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1013 rcu_dereference(ireq->ireq_opt),
1016 err = net_xmit_eval(err);
1023 * IPv4 request_sock destructor.
1025 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1027 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1030 #ifdef CONFIG_TCP_MD5SIG
1032 * RFC2385 MD5 checksumming requires a mapping of
1033 * IP address->MD5 Key.
1034 * We need to maintain these in the sk structure.
1037 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1038 EXPORT_SYMBOL(tcp_md5_needed);
1040 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1045 /* l3index always overrides non-l3index */
1046 if (old->l3index && new->l3index == 0)
1048 if (old->l3index == 0 && new->l3index)
1051 return old->prefixlen < new->prefixlen;
1054 /* Find the Key structure for an address. */
1055 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1056 const union tcp_md5_addr *addr,
1059 const struct tcp_sock *tp = tcp_sk(sk);
1060 struct tcp_md5sig_key *key;
1061 const struct tcp_md5sig_info *md5sig;
1063 struct tcp_md5sig_key *best_match = NULL;
1066 /* caller either holds rcu_read_lock() or socket lock */
1067 md5sig = rcu_dereference_check(tp->md5sig_info,
1068 lockdep_sock_is_held(sk));
1072 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1073 lockdep_sock_is_held(sk)) {
1074 if (key->family != family)
1076 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1078 if (family == AF_INET) {
1079 mask = inet_make_mask(key->prefixlen);
1080 match = (key->addr.a4.s_addr & mask) ==
1081 (addr->a4.s_addr & mask);
1082 #if IS_ENABLED(CONFIG_IPV6)
1083 } else if (family == AF_INET6) {
1084 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1091 if (match && better_md5_match(best_match, key))
1096 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1098 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1099 const union tcp_md5_addr *addr,
1100 int family, u8 prefixlen,
1101 int l3index, u8 flags)
1103 const struct tcp_sock *tp = tcp_sk(sk);
1104 struct tcp_md5sig_key *key;
1105 unsigned int size = sizeof(struct in_addr);
1106 const struct tcp_md5sig_info *md5sig;
1108 /* caller either holds rcu_read_lock() or socket lock */
1109 md5sig = rcu_dereference_check(tp->md5sig_info,
1110 lockdep_sock_is_held(sk));
1113 #if IS_ENABLED(CONFIG_IPV6)
1114 if (family == AF_INET6)
1115 size = sizeof(struct in6_addr);
1117 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1118 lockdep_sock_is_held(sk)) {
1119 if (key->family != family)
1121 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1123 if (key->l3index != l3index)
1125 if (!memcmp(&key->addr, addr, size) &&
1126 key->prefixlen == prefixlen)
1132 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1133 const struct sock *addr_sk)
1135 const union tcp_md5_addr *addr;
1138 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1139 addr_sk->sk_bound_dev_if);
1140 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1141 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1143 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1145 /* This can be called on a newly created socket, from other files */
1146 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1147 int family, u8 prefixlen, int l3index, u8 flags,
1148 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1150 /* Add Key to the list */
1151 struct tcp_md5sig_key *key;
1152 struct tcp_sock *tp = tcp_sk(sk);
1153 struct tcp_md5sig_info *md5sig;
1155 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1157 /* Pre-existing entry - just update that one.
1158 * Note that the key might be used concurrently.
1159 * data_race() is telling kcsan that we do not care about
1160 * key mismatches, since changing the MD5 key on live flows
1161 * can lead to packet drops.
1163 data_race(memcpy(key->key, newkey, newkeylen));
1165 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1166 * Also note that a reader could catch the new key->keylen value
1167 * but the old key->key[]; this is the reason we use __GFP_ZERO
1168 * at sock_kmalloc() time below these lines.
1170 WRITE_ONCE(key->keylen, newkeylen);
1175 md5sig = rcu_dereference_protected(tp->md5sig_info,
1176 lockdep_sock_is_held(sk));
1178 md5sig = kmalloc(sizeof(*md5sig), gfp);
1182 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1183 INIT_HLIST_HEAD(&md5sig->head);
1184 rcu_assign_pointer(tp->md5sig_info, md5sig);
1187 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1190 if (!tcp_alloc_md5sig_pool()) {
1191 sock_kfree_s(sk, key, sizeof(*key));
1195 memcpy(key->key, newkey, newkeylen);
1196 key->keylen = newkeylen;
1197 key->family = family;
1198 key->prefixlen = prefixlen;
1199 key->l3index = l3index;
1201 memcpy(&key->addr, addr,
1202 (family == AF_INET6) ? sizeof(struct in6_addr) :
1203 sizeof(struct in_addr));
1204 hlist_add_head_rcu(&key->node, &md5sig->head);
1207 EXPORT_SYMBOL(tcp_md5_do_add);
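/* Userspace reaches tcp_md5_do_add()/tcp_md5_do_del() via the TCP_MD5SIG
 * and TCP_MD5SIG_EXT socket options (see tcp_v4_parse_md5_keys() below).
 * A minimal sketch, illustrative only ('peer' is a hypothetical
 * struct sockaddr_in for the remote endpoint):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	memcpy(&md5.tcpm_addr, &peer, sizeof(peer));
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key for that address instead.
 */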
1209 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1210 u8 prefixlen, int l3index, u8 flags)
1212 struct tcp_md5sig_key *key;
1214 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1217 hlist_del_rcu(&key->node);
1218 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1219 kfree_rcu(key, rcu);
1222 EXPORT_SYMBOL(tcp_md5_do_del);
1224 static void tcp_clear_md5_list(struct sock *sk)
1226 struct tcp_sock *tp = tcp_sk(sk);
1227 struct tcp_md5sig_key *key;
1228 struct hlist_node *n;
1229 struct tcp_md5sig_info *md5sig;
1231 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1233 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1234 hlist_del_rcu(&key->node);
1235 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1236 kfree_rcu(key, rcu);
1240 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1241 sockptr_t optval, int optlen)
1243 struct tcp_md5sig cmd;
1244 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1245 const union tcp_md5_addr *addr;
1250 if (optlen < sizeof(cmd))
1253 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1256 if (sin->sin_family != AF_INET)
1259 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1261 if (optname == TCP_MD5SIG_EXT &&
1262 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1263 prefixlen = cmd.tcpm_prefixlen;
1268 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1269 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1270 struct net_device *dev;
1273 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1274 if (dev && netif_is_l3_master(dev))
1275 l3index = dev->ifindex;
1279 /* ok to reference set/not set outside of rcu;
1280 * right now device MUST be an L3 master
1282 if (!dev || !l3index)
1286 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1288 if (!cmd.tcpm_keylen)
1289 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1291 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1294 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1295 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1298 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1299 __be32 daddr, __be32 saddr,
1300 const struct tcphdr *th, int nbytes)
1302 struct tcp4_pseudohdr *bp;
1303 struct scatterlist sg;
1310 bp->protocol = IPPROTO_TCP;
1311 bp->len = cpu_to_be16(nbytes);
1313 _th = (struct tcphdr *)(bp + 1);
1314 memcpy(_th, th, sizeof(*th));
1317 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1318 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1319 sizeof(*bp) + sizeof(*th));
1320 return crypto_ahash_update(hp->md5_req);
1323 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1324 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1326 struct tcp_md5sig_pool *hp;
1327 struct ahash_request *req;
1329 hp = tcp_get_md5sig_pool();
1331 goto clear_hash_noput;
1334 if (crypto_ahash_init(req))
1336 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1338 if (tcp_md5_hash_key(hp, key))
1340 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1341 if (crypto_ahash_final(req))
1344 tcp_put_md5sig_pool();
1348 tcp_put_md5sig_pool();
1350 memset(md5_hash, 0, 16);
1354 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1355 const struct sock *sk,
1356 const struct sk_buff *skb)
1358 struct tcp_md5sig_pool *hp;
1359 struct ahash_request *req;
1360 const struct tcphdr *th = tcp_hdr(skb);
1361 __be32 saddr, daddr;
1363 if (sk) { /* valid for establish/request sockets */
1364 saddr = sk->sk_rcv_saddr;
1365 daddr = sk->sk_daddr;
1367 const struct iphdr *iph = ip_hdr(skb);
1372 hp = tcp_get_md5sig_pool();
1374 goto clear_hash_noput;
1377 if (crypto_ahash_init(req))
1380 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1382 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1384 if (tcp_md5_hash_key(hp, key))
1386 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1387 if (crypto_ahash_final(req))
1390 tcp_put_md5sig_pool();
1394 tcp_put_md5sig_pool();
1396 memset(md5_hash, 0, 16);
1399 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
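/* Per RFC 2385, the digest covers the IPv4 pseudo-header, the TCP header
 * with its checksum field zeroed, the payload, and finally the key itself;
 * tcp_v4_md5_hash_headers() feeds the first two pieces into the hash and
 * tcp_v4_md5_hash_skb() adds the payload and the key.
 */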
1403 /* Called with rcu_read_lock() */
1404 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1405 const struct sk_buff *skb,
1408 #ifdef CONFIG_TCP_MD5SIG
1410 * This gets called for each TCP segment that arrives
1411 * so we want to be efficient.
1412 * We have 3 drop cases:
1413 * o No MD5 hash and one expected.
1414 * o MD5 hash and we're not expecting one.
1415 * o MD5 hash and it's wrong.
1417 const __u8 *hash_location = NULL;
1418 struct tcp_md5sig_key *hash_expected;
1419 const struct iphdr *iph = ip_hdr(skb);
1420 const struct tcphdr *th = tcp_hdr(skb);
1421 const union tcp_md5_addr *addr;
1422 unsigned char newhash[16];
1423 int genhash, l3index;
1425 /* sdif set means the packet ingressed via a device
1426 * in an L3 domain and dif is set to the l3mdev
1428 l3index = sdif ? dif : 0;
1430 addr = (union tcp_md5_addr *)&iph->saddr;
1431 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1432 hash_location = tcp_parse_md5sig_option(th);
1434 /* We've parsed the options - do we have a hash? */
1435 if (!hash_expected && !hash_location)
1438 if (hash_expected && !hash_location) {
1439 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1443 if (!hash_expected && hash_location) {
1444 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1448 /* Okay, so this is hash_expected and hash_location -
1449 * so we need to calculate the checksum.
1451 genhash = tcp_v4_md5_hash_skb(newhash,
1455 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1456 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1457 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1458 &iph->saddr, ntohs(th->source),
1459 &iph->daddr, ntohs(th->dest),
1460 genhash ? " tcp_v4_calc_md5_hash failed"
1469 static void tcp_v4_init_req(struct request_sock *req,
1470 const struct sock *sk_listener,
1471 struct sk_buff *skb)
1473 struct inet_request_sock *ireq = inet_rsk(req);
1474 struct net *net = sock_net(sk_listener);
1476 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1477 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1478 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1481 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1482 struct sk_buff *skb,
1484 struct request_sock *req)
1486 tcp_v4_init_req(req, sk, skb);
1488 if (security_inet_conn_request(sk, skb, req))
1491 return inet_csk_route_req(sk, &fl->u.ip4, req);
1494 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1496 .obj_size = sizeof(struct tcp_request_sock),
1497 .rtx_syn_ack = tcp_rtx_synack,
1498 .send_ack = tcp_v4_reqsk_send_ack,
1499 .destructor = tcp_v4_reqsk_destructor,
1500 .send_reset = tcp_v4_send_reset,
1501 .syn_ack_timeout = tcp_syn_ack_timeout,
1504 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1505 .mss_clamp = TCP_MSS_DEFAULT,
1506 #ifdef CONFIG_TCP_MD5SIG
1507 .req_md5_lookup = tcp_v4_md5_lookup,
1508 .calc_md5_hash = tcp_v4_md5_hash_skb,
1510 #ifdef CONFIG_SYN_COOKIES
1511 .cookie_init_seq = cookie_v4_init_sequence,
1513 .route_req = tcp_v4_route_req,
1514 .init_seq = tcp_v4_init_seq,
1515 .init_ts_off = tcp_v4_init_ts_off,
1516 .send_synack = tcp_v4_send_synack,
1519 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1521 /* Never answer SYNs sent to broadcast or multicast addresses */
1522 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1525 return tcp_conn_request(&tcp_request_sock_ops,
1526 &tcp_request_sock_ipv4_ops, sk, skb);
1532 EXPORT_SYMBOL(tcp_v4_conn_request);
1536 * The three way handshake has completed - we got a valid synack -
1537 * now create the new socket.
1539 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1540 struct request_sock *req,
1541 struct dst_entry *dst,
1542 struct request_sock *req_unhash,
1545 struct inet_request_sock *ireq;
1546 bool found_dup_sk = false;
1547 struct inet_sock *newinet;
1548 struct tcp_sock *newtp;
1550 #ifdef CONFIG_TCP_MD5SIG
1551 const union tcp_md5_addr *addr;
1552 struct tcp_md5sig_key *key;
1555 struct ip_options_rcu *inet_opt;
1557 if (sk_acceptq_is_full(sk))
1560 newsk = tcp_create_openreq_child(sk, req, skb);
1564 newsk->sk_gso_type = SKB_GSO_TCPV4;
1565 inet_sk_rx_dst_set(newsk, skb);
1567 newtp = tcp_sk(newsk);
1568 newinet = inet_sk(newsk);
1569 ireq = inet_rsk(req);
1570 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1571 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1572 newsk->sk_bound_dev_if = ireq->ir_iif;
1573 newinet->inet_saddr = ireq->ir_loc_addr;
1574 inet_opt = rcu_dereference(ireq->ireq_opt);
1575 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1576 newinet->mc_index = inet_iif(skb);
1577 newinet->mc_ttl = ip_hdr(skb)->ttl;
1578 newinet->rcv_tos = ip_hdr(skb)->tos;
1579 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1581 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1582 newinet->inet_id = prandom_u32();
1584 /* Set ToS of the new socket based upon the value of incoming SYN.
1585 * ECT bits are set later in tcp_init_transfer().
1587 if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1588 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1591 dst = inet_csk_route_child_sock(sk, newsk, req);
1595 /* syncookie case: see end of cookie_v4_check() */
1597 sk_setup_caps(newsk, dst);
1599 tcp_ca_openreq_child(newsk, dst);
1601 tcp_sync_mss(newsk, dst_mtu(dst));
1602 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1604 tcp_initialize_rcv_mss(newsk);
1606 #ifdef CONFIG_TCP_MD5SIG
1607 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1608 /* Copy over the MD5 key from the original socket */
1609 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1610 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1613 * We're using one, so create a matching key
1614 * on the newsk structure. If we fail to get
1615 * memory, then we end up not copying the key
1616 * across. Shucks.
1618 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1619 key->key, key->keylen, GFP_ATOMIC);
1620 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1624 if (__inet_inherit_port(sk, newsk) < 0)
1626 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1628 if (likely(*own_req)) {
1629 tcp_move_syn(newtp, req);
1630 ireq->ireq_opt = NULL;
1632 newinet->inet_opt = NULL;
1634 if (!req_unhash && found_dup_sk) {
1635 /* This code path should only be executed in the
1636 * syncookie case.
1638 bh_unlock_sock(newsk);
1646 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1653 newinet->inet_opt = NULL;
1654 inet_csk_prepare_forced_close(newsk);
1658 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
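/* Under SYN-flood, a listener that could not store request state answers
 * with a cookie-encoded sequence number instead; tcp_v4_cookie_check()
 * below validates the cookie echoed in the final ACK and, if it is valid,
 * reconstructs the request socket from it.
 */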
1660 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1662 #ifdef CONFIG_SYN_COOKIES
1663 const struct tcphdr *th = tcp_hdr(skb);
1666 sk = cookie_v4_check(sk, skb);
1671 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1672 struct tcphdr *th, u32 *cookie)
1675 #ifdef CONFIG_SYN_COOKIES
1676 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1677 &tcp_request_sock_ipv4_ops, sk, th);
1679 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1680 tcp_synq_overflow(sk);
1686 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1688 /* The socket must have its spinlock held when we get
1689 * here, unless it is a TCP_LISTEN socket.
1691 * We have a potential double-lock case here, so even when
1692 * doing backlog processing we use the BH locking scheme.
1693 * This is because we cannot sleep with the original spinlock
1696 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1700 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1701 struct dst_entry *dst;
1703 dst = rcu_dereference_protected(sk->sk_rx_dst,
1704 lockdep_sock_is_held(sk));
1706 sock_rps_save_rxhash(sk, skb);
1707 sk_mark_napi_id(sk, skb);
1709 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1710 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1712 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1716 tcp_rcv_established(sk, skb);
1720 if (tcp_checksum_complete(skb))
1723 if (sk->sk_state == TCP_LISTEN) {
1724 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1729 if (tcp_child_process(sk, nsk, skb)) {
1736 sock_rps_save_rxhash(sk, skb);
1738 if (tcp_rcv_state_process(sk, skb)) {
1745 tcp_v4_send_reset(rsk, skb);
1748 /* Be careful here. If this function gets more complicated and
1749 * gcc suffers from register pressure on the x86, sk (in %ebx)
1750 * might be destroyed here. The current version compiles correctly,
1751 * but you have been warned.
1756 trace_tcp_bad_csum(skb);
1757 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1758 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1761 EXPORT_SYMBOL(tcp_v4_do_rcv);
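/* Early demux runs from the IP input path before routing: if the segment
 * matches an established socket we can reuse that socket's cached input
 * route (sk_rx_dst) and skip a full FIB lookup for this packet.
 */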
1763 int tcp_v4_early_demux(struct sk_buff *skb)
1765 const struct iphdr *iph;
1766 const struct tcphdr *th;
1769 if (skb->pkt_type != PACKET_HOST)
1772 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1778 if (th->doff < sizeof(struct tcphdr) / 4)
1781 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1782 iph->saddr, th->source,
1783 iph->daddr, ntohs(th->dest),
1784 skb->skb_iif, inet_sdif(skb));
1787 skb->destructor = sock_edemux;
1788 if (sk_fullsock(sk)) {
1789 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1792 dst = dst_check(dst, 0);
1794 sk->sk_rx_dst_ifindex == skb->skb_iif)
1795 skb_dst_set_noref(skb, dst);
1801 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1803 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1804 u32 tail_gso_size, tail_gso_segs;
1805 struct skb_shared_info *shinfo;
1806 const struct tcphdr *th;
1807 struct tcphdr *thtail;
1808 struct sk_buff *tail;
1809 unsigned int hdrlen;
1815 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1816 * we can fix skb->truesize to its real value to avoid future drops.
1817 * This is valid because skb is not yet charged to the socket.
1818 * It has been noticed that pure SACK packets were sometimes dropped
1819 * (if cooked by drivers without the copybreak feature).
1825 if (unlikely(tcp_checksum_complete(skb))) {
1827 trace_tcp_bad_csum(skb);
1828 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1829 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1833 /* Attempt coalescing to last skb in backlog, even if we are
1834 * above the limits.
1835 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1837 th = (const struct tcphdr *)skb->data;
1838 hdrlen = th->doff * 4;
1840 tail = sk->sk_backlog.tail;
1843 thtail = (struct tcphdr *)tail->data;
1845 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1846 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1847 ((TCP_SKB_CB(tail)->tcp_flags |
1848 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1849 !((TCP_SKB_CB(tail)->tcp_flags &
1850 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1851 ((TCP_SKB_CB(tail)->tcp_flags ^
1852 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1853 #ifdef CONFIG_TLS_DEVICE
1854 tail->decrypted != skb->decrypted ||
1856 thtail->doff != th->doff ||
1857 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1860 __skb_pull(skb, hdrlen);
1862 shinfo = skb_shinfo(skb);
1863 gso_size = shinfo->gso_size ?: skb->len;
1864 gso_segs = shinfo->gso_segs ?: 1;
1866 shinfo = skb_shinfo(tail);
1867 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1868 tail_gso_segs = shinfo->gso_segs ?: 1;
1870 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1871 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1873 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1874 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1875 thtail->window = th->window;
1878 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1879 * thtail->fin, so that the fast path in tcp_rcv_established()
1880 * is not entered if we append a packet with a FIN.
1881 * SYN, RST, URG are not present.
1882 * ACK is set on both packets.
1883 * PSH: we do not really care in the TCP stack,
1884 * at least for 'GRO' packets.
1886 thtail->fin |= th->fin;
1887 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1889 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1890 TCP_SKB_CB(tail)->has_rxtstamp = true;
1891 tail->tstamp = skb->tstamp;
1892 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1895 /* Not as strict as GRO. We only need to carry the max mss value */
1896 shinfo->gso_size = max(gso_size, tail_gso_size);
1897 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1899 sk->sk_backlog.len += delta;
1900 __NET_INC_STATS(sock_net(sk),
1901 LINUX_MIB_TCPBACKLOGCOALESCE);
1902 kfree_skb_partial(skb, fragstolen);
1905 __skb_push(skb, hdrlen);
1908 /* Only the socket owner can try to collapse/prune rx queues
1909 * to reduce memory overhead, so add a little headroom here.
1910 * Few socket backlogs are likely to be non-empty concurrently.
1914 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1916 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1921 EXPORT_SYMBOL(tcp_add_backlog);
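/* Run the socket's attached (BPF) filter over the segment;
 * sk_filter_trim_cap() lets the filter trim payload but never below the
 * TCP header itself (th->doff * 4 bytes).
 */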
1923 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1925 struct tcphdr *th = (struct tcphdr *)skb->data;
1927 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1929 EXPORT_SYMBOL(tcp_filter);
1931 static void tcp_v4_restore_cb(struct sk_buff *skb)
1933 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1934 sizeof(struct inet_skb_parm));
1937 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1938 const struct tcphdr *th)
1940 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1941 * barrier() makes sure the compiler won't play fool^Waliasing games.
1943 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1944 sizeof(struct inet_skb_parm));
1947 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1948 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1949 skb->len - th->doff * 4);
1950 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1951 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1952 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1953 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1954 TCP_SKB_CB(skb)->sacked = 0;
1955 TCP_SKB_CB(skb)->has_rxtstamp =
1956 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1963 int tcp_v4_rcv(struct sk_buff *skb)
1965 struct net *net = dev_net(skb->dev);
1966 struct sk_buff *skb_to_free;
1967 int sdif = inet_sdif(skb);
1968 int dif = inet_iif(skb);
1969 const struct iphdr *iph;
1970 const struct tcphdr *th;
1975 if (skb->pkt_type != PACKET_HOST)
1978 /* Count it even if it's bad */
1979 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1981 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1984 th = (const struct tcphdr *)skb->data;
1986 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1988 if (!pskb_may_pull(skb, th->doff * 4))
1991 /* An explanation is required here, I think.
1992 * Packet length and doff are validated by header prediction,
1993 * provided the case of th->doff==0 is eliminated.
1994 * So, we defer the checks. */
1996 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1999 th = (const struct tcphdr *)skb->data;
2002 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
2003 th->dest, sdif, &refcounted);
2008 if (sk->sk_state == TCP_TIME_WAIT)
2011 if (sk->sk_state == TCP_NEW_SYN_RECV) {
2012 struct request_sock *req = inet_reqsk(sk);
2013 bool req_stolen = false;
2016 sk = req->rsk_listener;
2017 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
2018 sk_drops_add(sk, skb);
2022 if (tcp_checksum_complete(skb)) {
2026 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2027 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2029 inet_csk_reqsk_queue_drop_and_put(sk, req);
2033 /* reuseport_migrate_sock() has already held one sk_refcnt
2034 * before returning.
2037 /* We own a reference on the listener, increase it again
2038 * as we might lose it too soon.
2044 if (!tcp_filter(sk, skb)) {
2045 th = (const struct tcphdr *)skb->data;
2047 tcp_v4_fill_cb(skb, iph, th);
2048 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2053 /* Another cpu got exclusive access to req
2054 * and created a full blown socket.
2055 * Try to feed this packet to this socket
2056 * instead of discarding it.
2058 tcp_v4_restore_cb(skb);
2062 goto discard_and_relse;
2066 tcp_v4_restore_cb(skb);
2067 } else if (tcp_child_process(sk, nsk, skb)) {
2068 tcp_v4_send_reset(nsk, skb);
2069 goto discard_and_relse;
2075 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2076 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2077 goto discard_and_relse;
2080 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2081 goto discard_and_relse;
2083 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2084 goto discard_and_relse;
2088 if (tcp_filter(sk, skb))
2089 goto discard_and_relse;
2090 th = (const struct tcphdr *)skb->data;
2092 tcp_v4_fill_cb(skb, iph, th);
2096 if (sk->sk_state == TCP_LISTEN) {
2097 ret = tcp_v4_do_rcv(sk, skb);
2098 goto put_and_return;
2101 sk_incoming_cpu_update(sk);
2103 bh_lock_sock_nested(sk);
2104 tcp_segs_in(tcp_sk(sk), skb);
2106 if (!sock_owned_by_user(sk)) {
2107 skb_to_free = sk->sk_rx_skb_cache;
2108 sk->sk_rx_skb_cache = NULL;
2109 ret = tcp_v4_do_rcv(sk, skb);
2111 if (tcp_add_backlog(sk, skb))
2112 goto discard_and_relse;
2117 __kfree_skb(skb_to_free);
2126 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2129 tcp_v4_fill_cb(skb, iph, th);
2131 if (tcp_checksum_complete(skb)) {
2133 trace_tcp_bad_csum(skb);
2134 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2136 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2138 tcp_v4_send_reset(NULL, skb);
2142 /* Discard frame. */
2147 sk_drops_add(sk, skb);
2153 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2154 inet_twsk_put(inet_twsk(sk));
2158 tcp_v4_fill_cb(skb, iph, th);
2160 if (tcp_checksum_complete(skb)) {
2161 inet_twsk_put(inet_twsk(sk));
2164 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2166 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2169 iph->saddr, th->source,
2170 iph->daddr, th->dest,
2174 inet_twsk_deschedule_put(inet_twsk(sk));
2176 tcp_v4_restore_cb(skb);
2184 tcp_v4_timewait_ack(sk, skb);
2187 tcp_v4_send_reset(sk, skb);
2188 inet_twsk_deschedule_put(inet_twsk(sk));
2190 case TCP_TW_SUCCESS:;
2195 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2196 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2197 .twsk_unique = tcp_twsk_unique,
2198 .twsk_destructor= tcp_twsk_destructor,
2201 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2203 struct dst_entry *dst = skb_dst(skb);
2205 if (dst && dst_hold_safe(dst)) {
2206 rcu_assign_pointer(sk->sk_rx_dst, dst);
2207 sk->sk_rx_dst_ifindex = skb->skb_iif;
2210 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2212 const struct inet_connection_sock_af_ops ipv4_specific = {
2213 .queue_xmit = ip_queue_xmit,
2214 .send_check = tcp_v4_send_check,
2215 .rebuild_header = inet_sk_rebuild_header,
2216 .sk_rx_dst_set = inet_sk_rx_dst_set,
2217 .conn_request = tcp_v4_conn_request,
2218 .syn_recv_sock = tcp_v4_syn_recv_sock,
2219 .net_header_len = sizeof(struct iphdr),
2220 .setsockopt = ip_setsockopt,
2221 .getsockopt = ip_getsockopt,
2222 .addr2sockaddr = inet_csk_addr2sockaddr,
2223 .sockaddr_len = sizeof(struct sockaddr_in),
2224 .mtu_reduced = tcp_v4_mtu_reduced,
2226 EXPORT_SYMBOL(ipv4_specific);
2228 #ifdef CONFIG_TCP_MD5SIG
2229 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2230 .md5_lookup = tcp_v4_md5_lookup,
2231 .calc_md5_hash = tcp_v4_md5_hash_skb,
2232 .md5_parse = tcp_v4_parse_md5_keys,
2236 /* NOTE: A lot of things are set to zero explicitly by the call to
2237 * sk_alloc(), so they need not be done here.
2239 static int tcp_v4_init_sock(struct sock *sk)
2241 struct inet_connection_sock *icsk = inet_csk(sk);
2245 icsk->icsk_af_ops = &ipv4_specific;
2247 #ifdef CONFIG_TCP_MD5SIG
2248 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2254 void tcp_v4_destroy_sock(struct sock *sk)
2256 struct tcp_sock *tp = tcp_sk(sk);
2258 trace_tcp_destroy_sock(sk);
2260 tcp_clear_xmit_timers(sk);
2262 tcp_cleanup_congestion_control(sk);
2264 tcp_cleanup_ulp(sk);
2266 /* Clean up the write buffer. */
2267 tcp_write_queue_purge(sk);
2269 /* Check if we want to disable active TFO */
2270 tcp_fastopen_active_disable_ofo_check(sk);
2272 /* Cleans up our, hopefully empty, out_of_order_queue. */
2273 skb_rbtree_purge(&tp->out_of_order_queue);
2275 #ifdef CONFIG_TCP_MD5SIG
2276 /* Clean up the MD5 key list, if any */
2277 if (tp->md5sig_info) {
2278 tcp_clear_md5_list(sk);
2279 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2280 tp->md5sig_info = NULL;
2284 /* Clean up a referenced TCP bind bucket. */
2285 if (inet_csk(sk)->icsk_bind_hash)
2288 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2290 /* If the socket is aborted during the connect operation */
2291 tcp_free_fastopen_req(tp);
2292 tcp_fastopen_destroy_cipher(sk);
2293 tcp_saved_syn_free(tp);
2295 sk_sockets_allocated_dec(sk);
2297 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2299 #ifdef CONFIG_PROC_FS
2300 /* Proc filesystem TCP sock list dumping. */
2302 static unsigned short seq_file_family(const struct seq_file *seq);
2304 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2306 unsigned short family = seq_file_family(seq);
2308 /* AF_UNSPEC is used as a match-all */
2309 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2310 net_eq(sock_net(sk), seq_file_net(seq)));
2313 /* Find a non-empty bucket (starting from st->bucket)
2314 * and return the first sk from it.
2316 static void *listening_get_first(struct seq_file *seq)
2318 struct tcp_iter_state *st = seq->private;
2321 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2322 struct inet_listen_hashbucket *ilb2;
2323 struct inet_connection_sock *icsk;
2326 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2327 if (hlist_empty(&ilb2->head))
2330 spin_lock(&ilb2->lock);
2331 inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
2332 sk = (struct sock *)icsk;
2333 if (seq_sk_match(seq, sk))
2336 spin_unlock(&ilb2->lock);
2342 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2343 * If "cur" is the last one in the st->bucket,
2344 * call listening_get_first() to return the first sk of the next
2345 * non-empty bucket.
2347 static void *listening_get_next(struct seq_file *seq, void *cur)
2349 struct tcp_iter_state *st = seq->private;
2350 struct inet_listen_hashbucket *ilb2;
2351 struct inet_connection_sock *icsk;
2352 struct sock *sk = cur;
2357 icsk = inet_csk(sk);
2358 inet_lhash2_for_each_icsk_continue(icsk) {
2359 sk = (struct sock *)icsk;
2360 if (seq_sk_match(seq, sk))
2364 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2365 spin_unlock(&ilb2->lock);
2367 return listening_get_first(seq);
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_first(seq);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}
static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}
/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (seq_sk_match(seq, sk))
				return sk;
		}
		spin_unlock_bh(lock);
	}

	return NULL;
}
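
/* Same locking scheme as the listening hash: a successful lookup
 * returns with the bucket lock held, to be released by
 * established_get_next() or tcp_seq_stop().
 */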
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
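
/* The combined iteration order is thus all matching listening sockets
 * first, then everything in the established hash (which also holds
 * TIME_WAIT entries); "pos" indexes into that combined sequence.
 */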
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket > tcp_hashinfo.lhash2_mask)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
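
/* The offset replay above is best effort: if sockets were added to or
 * removed from the bucket between reads, a restarted dump may show a
 * few entries twice or miss ones that moved, which is the usual
 * trade-off for not pinning the table across syscalls.
 */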
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);
void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}
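
/* A typical /proc/net/tcp line (illustrative values only, not taken
 * from a live system) looks like:
 *
 *   0: 0100007F:0CEA 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000 0 18083 1 0000000000000000 100 0 0 10 0
 *
 * i.e. a socket bound to 127.0.0.1:3306 (0x0CEA), unconnected, in
 * state 0x0A (TCP_LISTEN), owned by uid 1000.
 */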
#ifdef CONFIG_BPF_SYSCALL
struct bpf_tcp_iter_state {
	struct tcp_iter_state state;
	unsigned int cur_sk;
	unsigned int end_sk;
	unsigned int max_sk;
	struct sock **batch;
	bool st_bucket_done;
};
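
/* The batch array exists so that show() never runs under a bucket
 * lock: all sockets of one bucket are grabbed (each with a reference)
 * in a single pass under the lock, which is then dropped before the
 * bpf program is invoked on each batched socket.
 */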
struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct sock_common *, sk_common);
	uid_t uid __aligned(8);
};
static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			     struct sock_common *sk_common, uid_t uid)
{
	struct bpf_iter__tcp ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.sk_common = sk_common;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}
static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
{
	while (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);
}
static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
				      unsigned int new_batch_sz)
{
	struct sock **new_batch;

	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
			     GFP_USER | __GFP_NOWARN);
	if (!new_batch)
		return -ENOMEM;

	bpf_iter_tcp_put_batch(iter);
	kvfree(iter->batch);
	iter->batch = new_batch;
	iter->max_sk = new_batch_sz;

	return 0;
}
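
/* kvmalloc() may fall back to vmalloc() for large buckets, and
 * __GFP_NOWARN keeps a failed resize quiet; the caller then simply
 * proceeds with a partially batched bucket.
 */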
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
						 struct sock *start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct inet_connection_sock *icsk;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	icsk = inet_csk(start_sk);
	inet_lhash2_for_each_icsk_continue(icsk) {
		sk = (struct sock *)icsk;
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++] = sk;
			}
			expected++;
		}
	}
	spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);

	return expected;
}
static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
						   struct sock *start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	sk = sk_nulls_next(start_sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++] = sk;
			}
			expected++;
		}
	}
	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));

	return expected;
}
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	/* The st->bucket is done.  Directly advance to the next
	 * bucket instead of letting tcp_seek_last_pos() skip entries
	 * one by one in the current bucket before discovering that it
	 * has to advance to the next bucket.
	 */
	if (iter->st_bucket_done) {
		st->offset = 0;
		st->bucket++;
		if (st->state == TCP_SEQ_STATE_LISTENING &&
		    st->bucket > tcp_hashinfo.lhash2_mask) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
		}
	}

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;
	iter->st_bucket_done = false;

	sk = tcp_seek_last_pos(seq);
	if (!sk)
		return NULL; /* Done */

	if (st->state == TCP_SEQ_STATE_LISTENING)
		expected = bpf_iter_tcp_listening_batch(seq, sk);
	else
		expected = bpf_iter_tcp_established_batch(seq, sk);

	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}
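
/* If the batch still cannot hold the whole bucket (or the resize
 * failed), the partial batch is used and st_bucket_done stays false,
 * so the next call resumes inside the same bucket via st->offset.
 */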
static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	/* bpf iter does not support lseek, so it always
	 * continues from where it was stop()-ped.
	 */
	if (*pos)
		return bpf_iter_tcp_batch(seq);

	return SEQ_START_TOKEN;
}
static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk) {
		/* Keeping st->num consistent in tcp_iter_state.
		 * bpf_iter_tcp does not use st->num.
		 * meta.seq_num is used instead.
		 */
		st->num++;
		/* Move st->offset to the next sk in the bucket such that
		 * the future start() will resume at st->offset in
		 * st->bucket.  See tcp_seek_last_pos().
		 */
		st->offset++;
		sock_put(iter->batch[iter->cur_sk++]);
	}

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk];
	else
		sk = bpf_iter_tcp_batch(seq);

	++*pos;
	/* Keeping st->last_pos consistent in tcp_iter_state.
	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
	 */
	st->last_pos = *pos;

	return sk;
}
static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	bool slow;
	uid_t uid;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	if (sk_fullsock(sk))
		slow = lock_sock_fast(sk);

	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	if (sk->sk_state == TCP_TIME_WAIT) {
		uid = 0;
	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
		const struct request_sock *req = v;

		uid = from_kuid_munged(seq_user_ns(seq),
				       sock_i_uid(req->rsk_listener));
	} else {
		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	}

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = tcp_prog_seq_show(prog, &meta, v, uid);

unlock:
	if (sk_fullsock(sk))
		unlock_sock_fast(sk, slow);
	return ret;
}
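
/* Only full sockets are locked above: TCP_NEW_SYN_RECV and
 * TCP_TIME_WAIT entries are mini sockets without a socket lock,
 * hence the sk_fullsock() checks.
 */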
static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)tcp_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk) {
		bpf_iter_tcp_put_batch(iter);
		iter->st_bucket_done = false;
	}
}
static const struct seq_operations bpf_iter_tcp_seq_ops = {
	.show		= bpf_iter_tcp_seq_show,
	.start		= bpf_iter_tcp_seq_start,
	.next		= bpf_iter_tcp_seq_next,
	.stop		= bpf_iter_tcp_seq_stop,
};
#endif
static unsigned short seq_file_family(const struct seq_file *seq)
{
	const struct tcp_seq_afinfo *afinfo;

#ifdef CONFIG_BPF_SYSCALL
	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
	if (seq->op == &bpf_iter_tcp_seq_ops)
		return AF_UNSPEC;
#endif

	/* Iterated from proc fs */
	afinfo = PDE_DATA(file_inode(seq->file));
	return afinfo->family;
}
static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};
static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}
static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
/* @wake is one when sk_stream_write_space() calls us.
 * This sends EPOLLOUT only if notsent_bytes is below half the limit.
 * This mimics the strategy used in sock_def_write_space().
 */
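/* Worked example (illustrative numbers): with tcp_notsent_lowat set
 * to 128KB, the plain check (wake == 0) passes while less than 128KB
 * is unsent, but the wake-up path (wake == 1) doubles notsent_bytes
 * via the shift below, so EPOLLOUT is signalled only once the unsent
 * backlog drops below 64KB.
 */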
bool tcp_stream_memory_free(const struct sock *sk, int wake)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
			    READ_ONCE(tp->snd_nxt);

	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
}
EXPORT_SYMBOL(tcp_stream_memory_free);
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= tcp_bpf_update_proto,
#endif
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}
static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}
static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)

#define INIT_BATCH_SZ 16
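
/* The batch starts small; bpf_iter_tcp_batch() regrows it on demand
 * when a bucket turns out to hold more than INIT_BATCH_SZ sockets.
 */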
static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_tcp_iter_state *iter = priv_data;
	int err;

	err = bpf_iter_init_seq_net(priv_data, aux);
	if (err)
		return err;

	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
	if (err) {
		bpf_iter_fini_seq_net(priv_data);
		return err;
	}

	return 0;
}
static void bpf_iter_fini_tcp(void *priv_data)
{
	struct bpf_tcp_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}
static const struct bpf_iter_seq_info tcp_seq_info = {
	.seq_ops		= &bpf_iter_tcp_seq_ops,
	.init_seq_private	= bpf_iter_init_tcp,
	.fini_seq_private	= bpf_iter_fini_tcp,
	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
};
static const struct bpf_func_proto *
bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
			    const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_setsockopt:
		return &bpf_sk_setsockopt_proto;
	case BPF_FUNC_getsockopt:
		return &bpf_sk_getsockopt_proto;
	default:
		return NULL;
	}
}
static struct bpf_iter_reg tcp_reg_info = {
	.target			= "tcp",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.get_func_proto		= bpf_iter_tcp_get_func_proto,
	.seq_info		= &tcp_seq_info,
};
static void __init bpf_iter_register(void)
{
	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
	if (bpf_iter_reg_target(&tcp_reg_info))
		pr_warn("Warning: could not register bpf iterator tcp\n");
}
#endif
void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif
}