1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * Implementation of the Transmission Control Protocol(TCP).
9 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
48 #define pr_fmt(fmt) "TCP: " fmt
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
61 #include <net/net_namespace.h>
63 #include <net/inet_hashtables.h>
65 #include <net/transp_v6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
84 #include <trace/events/tcp.h>
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
98 return secure_tcp_seq(ip_hdr(skb)->daddr,
101 tcp_hdr(skb)->source);
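/* Illustrative note: the initial sequence number above is derived, in current
 * kernels, from a keyed hash of the connection 4-tuple plus a coarse clock,
 * roughly
 *
 *	ISN = F(saddr, daddr, sport, dport, secret) + (ns clock >> 6)
 *
 * where F is SipHash. The clock term keeps the sequence space of successive
 * incarnations of the same 4-tuple moving forward; the hashed term keeps the
 * ISN unpredictable to off-path attackers (RFC 6528 style).
 */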
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
106 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 const struct inet_timewait_sock *tw = inet_twsk(sktw);
112 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
113 struct tcp_sock *tp = tcp_sk(sk);
114 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
117 /* Still does not detect *everything* that goes through
118 * lo, since we require a loopback src or dst address
119 * or direct binding to 'lo' interface.
121 bool loopback = false;
122 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
124 #if IS_ENABLED(CONFIG_IPV6)
125 if (tw->tw_family == AF_INET6) {
126 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
134 if (ipv4_is_loopback(tw->tw_daddr) ||
135 ipv4_is_loopback(tw->tw_rcv_saddr))
142 /* With PAWS, it is safe from the viewpoint
143 of data integrity. Even without PAWS it is safe provided sequence
144 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
146 Actually, the idea is close to VJ's, only the timestamp cache is
147 held not per host but per port pair, and the TW bucket is used as state
150 If the TW bucket has already been destroyed we fall back to VJ's scheme
151 and use the initial timestamp retrieved from the peer table.
153 if (tcptw->tw_ts_recent_stamp &&
154 (!twp || (reuse && time_after32(ktime_get_seconds(),
155 tcptw->tw_ts_recent_stamp)))) {
156 /* In case of repair and re-using TIME-WAIT sockets we still
157 * want to be sure that it is safe as above but honor the
158 * sequence numbers and time stamps set as part of the repair
161 * Without this check re-using a TIME-WAIT socket with TCP
162 * repair would accumulate a -1 on the repair assigned
163 * sequence number. The first time it is reused the sequence
164 * is -1, the second time -2, etc. This fixes that issue
165 * without appearing to create any others.
167 if (likely(!tp->repair)) {
168 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
172 WRITE_ONCE(tp->write_seq, seq);
173 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
174 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
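/* Illustrative note: when the reuse above succeeds for a non-repair socket,
 * the new connection starts its send space just past anything the old
 * incarnation could still have in flight. With the old TIME-WAIT socket's
 * tw_snd_nxt at, say, 1000000:
 *
 *	write_seq = 1000000 + 65535 + 2 = 1065537
 *
 * i.e. one maximum unscaled window plus a little slack beyond the old send
 * space, so stray segments from the previous connection cannot be mistaken
 * for new data.
 */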
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
187 /* This check is replicated from tcp_v4_connect() and intended to
188 * prevent BPF program called below from accessing bytes that are out
189 * of the bound specified by user in addr_len.
191 if (addr_len < sizeof(struct sockaddr_in))
194 sock_owned_by_me(sk);
196 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
202 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203 struct inet_sock *inet = inet_sk(sk);
204 struct tcp_sock *tp = tcp_sk(sk);
205 __be16 orig_sport, orig_dport;
206 __be32 daddr, nexthop;
210 struct ip_options_rcu *inet_opt;
211 struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
213 if (addr_len < sizeof(struct sockaddr_in))
216 if (usin->sin_family != AF_INET)
217 return -EAFNOSUPPORT;
219 nexthop = daddr = usin->sin_addr.s_addr;
220 inet_opt = rcu_dereference_protected(inet->inet_opt,
221 lockdep_sock_is_held(sk));
222 if (inet_opt && inet_opt->opt.srr) {
225 nexthop = inet_opt->opt.faddr;
228 orig_sport = inet->inet_sport;
229 orig_dport = usin->sin_port;
230 fl4 = &inet->cork.fl.u.ip4;
231 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
232 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
234 orig_sport, orig_dport, sk);
237 if (err == -ENETUNREACH)
238 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
242 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
247 if (!inet_opt || !inet_opt->opt.srr)
250 if (!inet->inet_saddr)
251 inet->inet_saddr = fl4->saddr;
252 sk_rcv_saddr_set(sk, inet->inet_saddr);
254 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
255 /* Reset inherited state */
256 tp->rx_opt.ts_recent = 0;
257 tp->rx_opt.ts_recent_stamp = 0;
258 if (likely(!tp->repair))
259 WRITE_ONCE(tp->write_seq, 0);
262 inet->inet_dport = usin->sin_port;
263 sk_daddr_set(sk, daddr);
265 inet_csk(sk)->icsk_ext_hdr_len = 0;
267 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
269 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
271 /* Socket identity is still unknown (sport may be zero).
272 * However, we set the state to SYN-SENT and, without releasing the socket
273 * lock, select a source port, enter ourselves into the hash tables and
274 * complete initialization after this.
276 tcp_set_state(sk, TCP_SYN_SENT);
277 err = inet_hash_connect(tcp_death_row, sk);
283 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
284 inet->inet_sport, inet->inet_dport, sk);
290 /* OK, now commit destination to socket. */
291 sk->sk_gso_type = SKB_GSO_TCPV4;
292 sk_setup_caps(sk, &rt->dst);
295 if (likely(!tp->repair)) {
297 WRITE_ONCE(tp->write_seq,
298 secure_tcp_seq(inet->inet_saddr,
302 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
307 inet->inet_id = prandom_u32();
309 if (tcp_fastopen_defer_connect(sk, &err))
314 err = tcp_connect(sk);
323 * This unhashes the socket and releases the local port,
326 tcp_set_state(sk, TCP_CLOSE);
328 sk->sk_route_caps = 0;
329 inet->inet_dport = 0;
332 EXPORT_SYMBOL(tcp_v4_connect);
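/* Illustrative sketch: from user space this path is reached by an ordinary
 * connect() on a TCP socket; the peer address below is a hypothetical
 * example (192.0.2.1:80).
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *		.sin_addr   = { .s_addr = inet_addr("192.0.2.1") },
 *	};
 *
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * inet_stream_connect() takes the socket lock and calls tcp_v4_connect(),
 * which resolves the route, picks a source port, moves the socket to
 * SYN-SENT and emits the initial SYN via tcp_connect().
 */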
335 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
336 * It can be called through tcp_release_cb() if socket was owned by user
337 * at the time tcp_v4_err() was called to handle ICMP message.
339 void tcp_v4_mtu_reduced(struct sock *sk)
341 struct inet_sock *inet = inet_sk(sk);
342 struct dst_entry *dst;
345 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
347 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
348 dst = inet_csk_update_pmtu(sk, mtu);
352 /* Something is about to go wrong... Remember the soft error
353 * in case this connection is not able to recover.
355 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
356 sk->sk_err_soft = EMSGSIZE;
360 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
361 ip_sk_accept_pmtu(sk) &&
362 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
363 tcp_sync_mss(sk, mtu);
365 /* Resend the TCP packet because it's
366 * clear that the old packet has been
367 * dropped. This is the new "fast" path mtu
370 tcp_simple_retransmit(sk);
371 } /* else let the usual retransmit timer handle it */
373 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
375 static void do_redirect(struct sk_buff *skb, struct sock *sk)
377 struct dst_entry *dst = __sk_dst_check(sk, 0);
380 dst->ops->redirect(dst, sk, skb);
384 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
385 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
387 struct request_sock *req = inet_reqsk(sk);
388 struct net *net = sock_net(sk);
390 /* ICMPs are not backlogged, hence we cannot get
391 * an established socket here.
393 if (seq != tcp_rsk(req)->snt_isn) {
394 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
397 * Still in SYN_RECV, just remove it silently.
398 * There is no good way to pass the error to the newly
399 * created socket, and POSIX does not want network
400 * errors returned from accept().
402 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
403 tcp_listendrop(req->rsk_listener);
407 EXPORT_SYMBOL(tcp_req_err);
409 /* TCP-LD (RFC 6069) logic */
410 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
412 struct inet_connection_sock *icsk = inet_csk(sk);
413 struct tcp_sock *tp = tcp_sk(sk);
418 if (sock_owned_by_user(sk))
421 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
425 skb = tcp_rtx_queue_head(sk);
426 if (WARN_ON_ONCE(!skb))
429 icsk->icsk_backoff--;
430 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
431 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
433 tcp_mstamp_refresh(tp);
434 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
435 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
438 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
439 remaining, TCP_RTO_MAX);
441 /* RTO revert clocked out retransmission.
442 * Will retransmit now.
444 tcp_retransmit_timer(sk);
447 EXPORT_SYMBOL(tcp_ld_RTO_revert);
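/* Illustrative note: a worked example of the revert above. Suppose the
 * sender's RTO has been backed off three times (icsk_backoff == 3). When a
 * matching ICMP error for snd_una arrives, icsk_backoff drops to 2, the RTO
 * is recomputed from the current srtt and shifted by the new backoff (capped
 * at TCP_RTO_MAX), and the retransmit timer is re-armed with whatever part of
 * that shorter timeout has not already elapsed since the head skb was sent;
 * if nothing remains, the retransmit happens immediately.
 */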
450 * This routine is called by the ICMP module when it gets some
451 * sort of error condition. If err < 0 then the socket should
452 * be closed and the error returned to the user. If err > 0
453 * it's just the icmp type << 8 | icmp code. After adjustment
454 * header points to the first 8 bytes of the tcp header. We need
455 * to find the appropriate port.
457 * The locking strategy used here is very "optimistic". When
458 * someone else accesses the socket the ICMP is just dropped
459 * and for some paths there is no check at all.
460 * A more general error queue to queue errors for later handling
461 * is probably better.
465 int tcp_v4_err(struct sk_buff *skb, u32 info)
467 const struct iphdr *iph = (const struct iphdr *)skb->data;
468 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
470 struct inet_sock *inet;
471 const int type = icmp_hdr(skb)->type;
472 const int code = icmp_hdr(skb)->code;
474 struct request_sock *fastopen;
477 struct net *net = dev_net(skb->dev);
479 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
480 th->dest, iph->saddr, ntohs(th->source),
483 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
486 if (sk->sk_state == TCP_TIME_WAIT) {
487 inet_twsk_put(inet_twsk(sk));
490 seq = ntohl(th->seq);
491 if (sk->sk_state == TCP_NEW_SYN_RECV) {
492 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
493 type == ICMP_TIME_EXCEEDED ||
494 (type == ICMP_DEST_UNREACH &&
495 (code == ICMP_NET_UNREACH ||
496 code == ICMP_HOST_UNREACH)));
501 /* If too many ICMPs get dropped on busy
502 * servers this needs to be solved differently.
503 * We do take care of PMTU discovery (RFC1191) special case :
504 * we can receive locally generated ICMP messages while socket is held.
506 if (sock_owned_by_user(sk)) {
507 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
508 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
510 if (sk->sk_state == TCP_CLOSE)
513 if (static_branch_unlikely(&ip4_min_ttl)) {
514 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
515 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
516 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
522 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
523 fastopen = rcu_dereference(tp->fastopen_rsk);
524 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
525 if (sk->sk_state != TCP_LISTEN &&
526 !between(seq, snd_una, tp->snd_nxt)) {
527 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
533 if (!sock_owned_by_user(sk))
534 do_redirect(skb, sk);
536 case ICMP_SOURCE_QUENCH:
537 /* Just silently ignore these. */
539 case ICMP_PARAMETERPROB:
542 case ICMP_DEST_UNREACH:
543 if (code > NR_ICMP_UNREACH)
546 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
547 /* We are not interested in TCP_LISTEN and open_requests
548 * (SYN-ACKs sent out by Linux are always <576 bytes so
549 * they should go through unfragmented).
551 if (sk->sk_state == TCP_LISTEN)
554 WRITE_ONCE(tp->mtu_info, info);
555 if (!sock_owned_by_user(sk)) {
556 tcp_v4_mtu_reduced(sk);
558 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
564 err = icmp_err_convert[code].errno;
565 /* check if this ICMP message allows revert of backoff.
569 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
570 tcp_ld_RTO_revert(sk, seq);
572 case ICMP_TIME_EXCEEDED:
579 switch (sk->sk_state) {
582 /* Only in fast or simultaneous open. If a fast open socket is
583 * already accepted it is treated as a connected one below.
585 if (fastopen && !fastopen->sk)
588 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
590 if (!sock_owned_by_user(sk)) {
597 sk->sk_err_soft = err;
602 /* If we've already connected we will keep trying
603 * until we time out, or the user gives up.
605 * RFC 1122 4.2.3.9 allows us to treat as hard errors
606 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
607 * but it is obsoleted by PMTU discovery).
609 * Note that in the modern Internet, where routing is unreliable
610 * and broken firewalls sit in every dark corner, sending random
611 * errors on behalf of their masters, even these two messages have lost
612 * their original meaning (even Linux sends invalid PORT_UNREACHs).
614 * Now we are in compliance with RFCs.
619 if (!sock_owned_by_user(sk) && inet->recverr) {
622 } else { /* Only an error on timeout */
623 sk->sk_err_soft = err;
632 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
634 struct tcphdr *th = tcp_hdr(skb);
636 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
637 skb->csum_start = skb_transport_header(skb) - skb->head;
638 skb->csum_offset = offsetof(struct tcphdr, check);
641 /* This routine computes an IPv4 TCP checksum. */
642 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
644 const struct inet_sock *inet = inet_sk(sk);
646 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
648 EXPORT_SYMBOL(tcp_v4_send_check);
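/* Illustrative note: __tcp_v4_send_check() does not produce the final
 * checksum. It stores the complemented pseudo-header sum in th->check and
 * records where the result must go (skb->csum_start / skb->csum_offset); the
 * NIC, or skb_checksum_help() as the software fallback when CHECKSUM_PARTIAL
 * cannot be offloaded, then sums everything from csum_start onwards and
 * folds the result into that offset.
 */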
651 * This routine will send an RST to the other TCP.
653 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)?
655 * Answer: if a packet caused an RST, it is not for a socket
656 * existing in our system; if it is matched to a socket,
657 * it is just a duplicate segment or a bug in the other side's TCP.
658 * So we build the reply based only on the parameters
659 * that arrived with the segment.
660 * Exception: precedence violation. We do not implement it in any case.
663 #ifdef CONFIG_TCP_MD5SIG
664 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
666 #define OPTION_BYTES sizeof(__be32)
669 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
671 const struct tcphdr *th = tcp_hdr(skb);
674 __be32 opt[OPTION_BYTES / sizeof(__be32)];
676 struct ip_reply_arg arg;
677 #ifdef CONFIG_TCP_MD5SIG
678 struct tcp_md5sig_key *key = NULL;
679 const __u8 *hash_location = NULL;
680 unsigned char newhash[16];
682 struct sock *sk1 = NULL;
684 u64 transmit_time = 0;
688 /* Never send a reset in response to a reset. */
692 /* If sk not NULL, it means we did a successful lookup and incoming
693 * route had to be correct. prequeue might have dropped our dst.
695 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
698 /* Swap the send and the receive. */
699 memset(&rep, 0, sizeof(rep));
700 rep.th.dest = th->source;
701 rep.th.source = th->dest;
702 rep.th.doff = sizeof(struct tcphdr) / 4;
706 rep.th.seq = th->ack_seq;
709 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
710 skb->len - (th->doff << 2));
713 memset(&arg, 0, sizeof(arg));
714 arg.iov[0].iov_base = (unsigned char *)&rep;
715 arg.iov[0].iov_len = sizeof(rep.th);
717 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
718 #ifdef CONFIG_TCP_MD5SIG
720 hash_location = tcp_parse_md5sig_option(th);
721 if (sk && sk_fullsock(sk)) {
722 const union tcp_md5_addr *addr;
725 /* sdif set, means packet ingressed via a device
726 * in an L3 domain and inet_iif is set to it.
728 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
729 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
730 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
731 } else if (hash_location) {
732 const union tcp_md5_addr *addr;
733 int sdif = tcp_v4_sdif(skb);
734 int dif = inet_iif(skb);
738 * active side is lost. Try to find listening socket through
739 * source port, and then find md5 key through listening socket.
740 * We do not loosen security here:
741 * the incoming packet is checked against the MD5 hash of the key we find,
742 * and no RST is generated if the MD5 hash doesn't match.
744 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
746 th->source, ip_hdr(skb)->daddr,
747 ntohs(th->source), dif, sdif);
748 /* don't send rst if it can't find key */
752 /* sdif set, means packet ingressed via a device
753 * in an L3 domain and dif is set to it.
755 l3index = sdif ? dif : 0;
756 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
757 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
762 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
763 if (genhash || memcmp(hash_location, newhash, 16) != 0)
769 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
771 (TCPOPT_MD5SIG << 8) |
773 /* Update length and the length the header thinks exists */
774 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
775 rep.th.doff = arg.iov[0].iov_len / 4;
777 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
778 key, ip_hdr(skb)->saddr,
779 ip_hdr(skb)->daddr, &rep.th);
782 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
783 if (rep.opt[0] == 0) {
784 __be32 mrst = mptcp_reset_option(skb);
788 arg.iov[0].iov_len += sizeof(mrst);
789 rep.th.doff = arg.iov[0].iov_len / 4;
793 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
794 ip_hdr(skb)->saddr, /* XXX */
795 arg.iov[0].iov_len, IPPROTO_TCP, 0);
796 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
797 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
799 /* When socket is gone, all binding information is lost.
800 * Routing might fail in this case. No choice here: if we force the
801 * input interface, we will misroute in the case of an asymmetric route.
804 arg.bound_dev_if = sk->sk_bound_dev_if;
806 trace_tcp_send_reset(sk, skb);
809 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
810 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
812 arg.tos = ip_hdr(skb)->tos;
813 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
815 ctl_sk = this_cpu_read(ipv4_tcp_sk);
816 sock_net_set(ctl_sk, net);
818 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
819 inet_twsk(sk)->tw_mark : sk->sk_mark;
820 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
821 inet_twsk(sk)->tw_priority : sk->sk_priority;
822 transmit_time = tcp_transmit_time(sk);
824 ip_send_unicast_reply(ctl_sk,
825 skb, &TCP_SKB_CB(skb)->header.h4.opt,
826 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
827 &arg, arg.iov[0].iov_len,
831 sock_net_set(ctl_sk, &init_net);
832 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
833 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
836 #ifdef CONFIG_TCP_MD5SIG
842 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
843 outside of socket context, is certainly ugly. What can I do?
846 static void tcp_v4_send_ack(const struct sock *sk,
847 struct sk_buff *skb, u32 seq, u32 ack,
848 u32 win, u32 tsval, u32 tsecr, int oif,
849 struct tcp_md5sig_key *key,
850 int reply_flags, u8 tos)
852 const struct tcphdr *th = tcp_hdr(skb);
855 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
856 #ifdef CONFIG_TCP_MD5SIG
857 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
861 struct net *net = sock_net(sk);
862 struct ip_reply_arg arg;
866 memset(&rep.th, 0, sizeof(struct tcphdr));
867 memset(&arg, 0, sizeof(arg));
869 arg.iov[0].iov_base = (unsigned char *)&rep;
870 arg.iov[0].iov_len = sizeof(rep.th);
872 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
873 (TCPOPT_TIMESTAMP << 8) |
875 rep.opt[1] = htonl(tsval);
876 rep.opt[2] = htonl(tsecr);
877 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
880 /* Swap the send and the receive. */
881 rep.th.dest = th->source;
882 rep.th.source = th->dest;
883 rep.th.doff = arg.iov[0].iov_len / 4;
884 rep.th.seq = htonl(seq);
885 rep.th.ack_seq = htonl(ack);
887 rep.th.window = htons(win);
889 #ifdef CONFIG_TCP_MD5SIG
891 int offset = (tsecr) ? 3 : 0;
893 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
895 (TCPOPT_MD5SIG << 8) |
897 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
898 rep.th.doff = arg.iov[0].iov_len/4;
900 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
901 key, ip_hdr(skb)->saddr,
902 ip_hdr(skb)->daddr, &rep.th);
905 arg.flags = reply_flags;
906 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
907 ip_hdr(skb)->saddr, /* XXX */
908 arg.iov[0].iov_len, IPPROTO_TCP, 0);
909 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
911 arg.bound_dev_if = oif;
913 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
915 ctl_sk = this_cpu_read(ipv4_tcp_sk);
916 sock_net_set(ctl_sk, net);
917 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
918 inet_twsk(sk)->tw_mark : sk->sk_mark;
919 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
920 inet_twsk(sk)->tw_priority : sk->sk_priority;
921 transmit_time = tcp_transmit_time(sk);
922 ip_send_unicast_reply(ctl_sk,
923 skb, &TCP_SKB_CB(skb)->header.h4.opt,
924 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
925 &arg, arg.iov[0].iov_len,
929 sock_net_set(ctl_sk, &init_net);
930 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
934 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
936 struct inet_timewait_sock *tw = inet_twsk(sk);
937 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
939 tcp_v4_send_ack(sk, skb,
940 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
941 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
942 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
945 tcp_twsk_md5_key(tcptw),
946 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
953 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
954 struct request_sock *req)
956 const union tcp_md5_addr *addr;
959 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
960 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
962 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
966 * The window field (SEG.WND) of every outgoing segment, with the
967 * exception of <SYN> segments, MUST be right-shifted by
968 * Rcv.Wind.Shift bits:
970 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
971 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
972 tcp_v4_send_ack(sk, skb, seq,
973 tcp_rsk(req)->rcv_nxt,
974 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
975 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
978 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
979 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
984 * Send a SYN-ACK after having received a SYN.
985 * This still operates on a request_sock only, not on a big
988 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
990 struct request_sock *req,
991 struct tcp_fastopen_cookie *foc,
992 enum tcp_synack_type synack_type,
993 struct sk_buff *syn_skb)
995 const struct inet_request_sock *ireq = inet_rsk(req);
1001 /* First, grab a route. */
1002 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1005 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1008 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1010 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1011 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1012 (inet_sk(sk)->tos & INET_ECN_MASK) :
1015 if (!INET_ECN_is_capable(tos) &&
1016 tcp_bpf_ca_needs_ecn((struct sock *)req))
1017 tos |= INET_ECN_ECT_0;
1020 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1022 rcu_dereference(ireq->ireq_opt),
1025 err = net_xmit_eval(err);
1032 * IPv4 request_sock destructor.
1034 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1036 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1039 #ifdef CONFIG_TCP_MD5SIG
1041 * RFC2385 MD5 checksumming requires a mapping of
1042 * IP address->MD5 Key.
1043 * We need to maintain these in the sk structure.
1046 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1047 EXPORT_SYMBOL(tcp_md5_needed);
1049 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1054 /* l3index always overrides non-l3index */
1055 if (old->l3index && new->l3index == 0)
1057 if (old->l3index == 0 && new->l3index)
1060 return old->prefixlen < new->prefixlen;
1063 /* Find the Key structure for an address. */
1064 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1065 const union tcp_md5_addr *addr,
1068 const struct tcp_sock *tp = tcp_sk(sk);
1069 struct tcp_md5sig_key *key;
1070 const struct tcp_md5sig_info *md5sig;
1072 struct tcp_md5sig_key *best_match = NULL;
1075 /* caller either holds rcu_read_lock() or socket lock */
1076 md5sig = rcu_dereference_check(tp->md5sig_info,
1077 lockdep_sock_is_held(sk));
1081 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1082 lockdep_sock_is_held(sk)) {
1083 if (key->family != family)
1085 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1087 if (family == AF_INET) {
1088 mask = inet_make_mask(key->prefixlen);
1089 match = (key->addr.a4.s_addr & mask) ==
1090 (addr->a4.s_addr & mask);
1091 #if IS_ENABLED(CONFIG_IPV6)
1092 } else if (family == AF_INET6) {
1093 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1100 if (match && better_md5_match(best_match, key))
1105 EXPORT_SYMBOL(__tcp_md5_do_lookup);
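/* Illustrative note: the lookup above returns the most specific matching
 * key. With two hypothetical IPv4 keys
 *
 *	key A: 10.0.0.0/8,  no l3index
 *	key B: 10.0.0.0/24, no l3index
 *
 * a peer at 10.0.0.5 matches both and better_md5_match() picks key B because
 * of its longer prefix. A key bound to an L3 master device (non-zero
 * l3index) always beats one that is not, regardless of prefix length.
 */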
1107 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1108 const union tcp_md5_addr *addr,
1109 int family, u8 prefixlen,
1110 int l3index, u8 flags)
1112 const struct tcp_sock *tp = tcp_sk(sk);
1113 struct tcp_md5sig_key *key;
1114 unsigned int size = sizeof(struct in_addr);
1115 const struct tcp_md5sig_info *md5sig;
1117 /* caller either holds rcu_read_lock() or socket lock */
1118 md5sig = rcu_dereference_check(tp->md5sig_info,
1119 lockdep_sock_is_held(sk));
1122 #if IS_ENABLED(CONFIG_IPV6)
1123 if (family == AF_INET6)
1124 size = sizeof(struct in6_addr);
1126 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1127 lockdep_sock_is_held(sk)) {
1128 if (key->family != family)
1130 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1132 if (key->l3index != l3index)
1134 if (!memcmp(&key->addr, addr, size) &&
1135 key->prefixlen == prefixlen)
1141 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1142 const struct sock *addr_sk)
1144 const union tcp_md5_addr *addr;
1147 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1148 addr_sk->sk_bound_dev_if);
1149 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1150 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1152 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1154 /* This can be called on a newly created socket, from other files */
1155 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1156 int family, u8 prefixlen, int l3index, u8 flags,
1157 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1159 /* Add Key to the list */
1160 struct tcp_md5sig_key *key;
1161 struct tcp_sock *tp = tcp_sk(sk);
1162 struct tcp_md5sig_info *md5sig;
1164 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1166 /* Pre-existing entry - just update that one.
1167 * Note that the key might be used concurrently.
1168 * data_race() tells KCSAN that we do not care about
1169 * key mismatches, since changing MD5 key on live flows
1170 * can lead to packet drops.
1172 data_race(memcpy(key->key, newkey, newkeylen));
1174 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1175 * Also note that a reader could catch new key->keylen value
1176 * but old key->key[], this is the reason we use __GFP_ZERO
1177 * at sock_kmalloc() time below these lines.
1179 WRITE_ONCE(key->keylen, newkeylen);
1184 md5sig = rcu_dereference_protected(tp->md5sig_info,
1185 lockdep_sock_is_held(sk));
1187 md5sig = kmalloc(sizeof(*md5sig), gfp);
1192 INIT_HLIST_HEAD(&md5sig->head);
1193 rcu_assign_pointer(tp->md5sig_info, md5sig);
1196 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1199 if (!tcp_alloc_md5sig_pool()) {
1200 sock_kfree_s(sk, key, sizeof(*key));
1204 memcpy(key->key, newkey, newkeylen);
1205 key->keylen = newkeylen;
1206 key->family = family;
1207 key->prefixlen = prefixlen;
1208 key->l3index = l3index;
1210 memcpy(&key->addr, addr,
1211 (family == AF_INET6) ? sizeof(struct in6_addr) :
1212 sizeof(struct in_addr));
1213 hlist_add_head_rcu(&key->node, &md5sig->head);
1216 EXPORT_SYMBOL(tcp_md5_do_add);
1218 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1219 u8 prefixlen, int l3index, u8 flags)
1221 struct tcp_md5sig_key *key;
1223 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1226 hlist_del_rcu(&key->node);
1227 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1228 kfree_rcu(key, rcu);
1231 EXPORT_SYMBOL(tcp_md5_do_del);
1233 static void tcp_clear_md5_list(struct sock *sk)
1235 struct tcp_sock *tp = tcp_sk(sk);
1236 struct tcp_md5sig_key *key;
1237 struct hlist_node *n;
1238 struct tcp_md5sig_info *md5sig;
1240 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1242 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1243 hlist_del_rcu(&key->node);
1244 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1245 kfree_rcu(key, rcu);
1249 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1250 sockptr_t optval, int optlen)
1252 struct tcp_md5sig cmd;
1253 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1254 const union tcp_md5_addr *addr;
1259 if (optlen < sizeof(cmd))
1262 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1265 if (sin->sin_family != AF_INET)
1268 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1270 if (optname == TCP_MD5SIG_EXT &&
1271 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1272 prefixlen = cmd.tcpm_prefixlen;
1277 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1278 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1279 struct net_device *dev;
1282 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1283 if (dev && netif_is_l3_master(dev))
1284 l3index = dev->ifindex;
1288 /* ok to reference set/not set outside of rcu;
1289 * right now device MUST be an L3 master
1291 if (!dev || !l3index)
1295 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1297 if (!cmd.tcpm_keylen)
1298 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1300 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1303 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1304 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
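/* Illustrative sketch: user space feeds this parser through
 * setsockopt(TCP_MD5SIG), or TCP_MD5SIG_EXT when the prefix/ifindex
 * extensions are wanted. A minimal example for a hypothetical peer
 * 192.0.2.1:
 *
 *	struct tcp_md5sig md5 = { };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing tcpm_keylen == 0 deletes the key for that address instead.
 */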
1307 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1308 __be32 daddr, __be32 saddr,
1309 const struct tcphdr *th, int nbytes)
1311 struct tcp4_pseudohdr *bp;
1312 struct scatterlist sg;
1319 bp->protocol = IPPROTO_TCP;
1320 bp->len = cpu_to_be16(nbytes);
1322 _th = (struct tcphdr *)(bp + 1);
1323 memcpy(_th, th, sizeof(*th));
1326 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1327 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1328 sizeof(*bp) + sizeof(*th));
1329 return crypto_ahash_update(hp->md5_req);
1332 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1333 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1335 struct tcp_md5sig_pool *hp;
1336 struct ahash_request *req;
1338 hp = tcp_get_md5sig_pool();
1340 goto clear_hash_noput;
1343 if (crypto_ahash_init(req))
1345 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1347 if (tcp_md5_hash_key(hp, key))
1349 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1350 if (crypto_ahash_final(req))
1353 tcp_put_md5sig_pool();
1357 tcp_put_md5sig_pool();
1359 memset(md5_hash, 0, 16);
1363 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1364 const struct sock *sk,
1365 const struct sk_buff *skb)
1367 struct tcp_md5sig_pool *hp;
1368 struct ahash_request *req;
1369 const struct tcphdr *th = tcp_hdr(skb);
1370 __be32 saddr, daddr;
1372 if (sk) { /* valid for establish/request sockets */
1373 saddr = sk->sk_rcv_saddr;
1374 daddr = sk->sk_daddr;
1376 const struct iphdr *iph = ip_hdr(skb);
1381 hp = tcp_get_md5sig_pool();
1383 goto clear_hash_noput;
1386 if (crypto_ahash_init(req))
1389 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1391 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1393 if (tcp_md5_hash_key(hp, key))
1395 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1396 if (crypto_ahash_final(req))
1399 tcp_put_md5sig_pool();
1403 tcp_put_md5sig_pool();
1405 memset(md5_hash, 0, 16);
1408 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
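/* Illustrative note: per RFC 2385 the digest above is computed over, in
 * order, the IPv4 pseudo-header, the TCP header with its checksum zeroed
 * (options excluded), the segment payload, and finally the configured key.
 * Any change to that input, for example a NAT rewriting addresses or ports,
 * yields a different digest, which is why TCP-MD5 does not survive address
 * translation.
 */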
1412 static void tcp_v4_init_req(struct request_sock *req,
1413 const struct sock *sk_listener,
1414 struct sk_buff *skb)
1416 struct inet_request_sock *ireq = inet_rsk(req);
1417 struct net *net = sock_net(sk_listener);
1419 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1420 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1421 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1424 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1425 struct sk_buff *skb,
1427 struct request_sock *req)
1429 tcp_v4_init_req(req, sk, skb);
1431 if (security_inet_conn_request(sk, skb, req))
1434 return inet_csk_route_req(sk, &fl->u.ip4, req);
1437 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1439 .obj_size = sizeof(struct tcp_request_sock),
1440 .rtx_syn_ack = tcp_rtx_synack,
1441 .send_ack = tcp_v4_reqsk_send_ack,
1442 .destructor = tcp_v4_reqsk_destructor,
1443 .send_reset = tcp_v4_send_reset,
1444 .syn_ack_timeout = tcp_syn_ack_timeout,
1447 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1448 .mss_clamp = TCP_MSS_DEFAULT,
1449 #ifdef CONFIG_TCP_MD5SIG
1450 .req_md5_lookup = tcp_v4_md5_lookup,
1451 .calc_md5_hash = tcp_v4_md5_hash_skb,
1453 #ifdef CONFIG_SYN_COOKIES
1454 .cookie_init_seq = cookie_v4_init_sequence,
1456 .route_req = tcp_v4_route_req,
1457 .init_seq = tcp_v4_init_seq,
1458 .init_ts_off = tcp_v4_init_ts_off,
1459 .send_synack = tcp_v4_send_synack,
1462 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1464 /* Never answer to SYNs send to broadcast or multicast */
1465 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1468 return tcp_conn_request(&tcp_request_sock_ops,
1469 &tcp_request_sock_ipv4_ops, sk, skb);
1475 EXPORT_SYMBOL(tcp_v4_conn_request);
1479 * The three way handshake has completed - we got a valid synack -
1480 * now create the new socket.
1482 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1483 struct request_sock *req,
1484 struct dst_entry *dst,
1485 struct request_sock *req_unhash,
1488 struct inet_request_sock *ireq;
1489 bool found_dup_sk = false;
1490 struct inet_sock *newinet;
1491 struct tcp_sock *newtp;
1493 #ifdef CONFIG_TCP_MD5SIG
1494 const union tcp_md5_addr *addr;
1495 struct tcp_md5sig_key *key;
1498 struct ip_options_rcu *inet_opt;
1500 if (sk_acceptq_is_full(sk))
1503 newsk = tcp_create_openreq_child(sk, req, skb);
1507 newsk->sk_gso_type = SKB_GSO_TCPV4;
1508 inet_sk_rx_dst_set(newsk, skb);
1510 newtp = tcp_sk(newsk);
1511 newinet = inet_sk(newsk);
1512 ireq = inet_rsk(req);
1513 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1514 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1515 newsk->sk_bound_dev_if = ireq->ir_iif;
1516 newinet->inet_saddr = ireq->ir_loc_addr;
1517 inet_opt = rcu_dereference(ireq->ireq_opt);
1518 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1519 newinet->mc_index = inet_iif(skb);
1520 newinet->mc_ttl = ip_hdr(skb)->ttl;
1521 newinet->rcv_tos = ip_hdr(skb)->tos;
1522 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1524 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1525 newinet->inet_id = prandom_u32();
1527 /* Set ToS of the new socket based upon the value of incoming SYN.
1528 * ECT bits are set later in tcp_init_transfer().
1530 if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1531 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1534 dst = inet_csk_route_child_sock(sk, newsk, req);
1538 /* syncookie case : see end of cookie_v4_check() */
1540 sk_setup_caps(newsk, dst);
1542 tcp_ca_openreq_child(newsk, dst);
1544 tcp_sync_mss(newsk, dst_mtu(dst));
1545 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1547 tcp_initialize_rcv_mss(newsk);
1549 #ifdef CONFIG_TCP_MD5SIG
1550 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1551 /* Copy over the MD5 key from the original socket */
1552 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1553 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1556 * We're using one, so create a matching key
1557 * on the newsk structure. If we fail to get
1558 * memory, then we end up not copying the key
1561 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1562 key->key, key->keylen, GFP_ATOMIC);
1563 sk_gso_disable(newsk);
1567 if (__inet_inherit_port(sk, newsk) < 0)
1569 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1571 if (likely(*own_req)) {
1572 tcp_move_syn(newtp, req);
1573 ireq->ireq_opt = NULL;
1575 newinet->inet_opt = NULL;
1577 if (!req_unhash && found_dup_sk) {
1578 /* This code path should only be executed in the
1579 * syncookie case
1581 bh_unlock_sock(newsk);
1589 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1596 newinet->inet_opt = NULL;
1597 inet_csk_prepare_forced_close(newsk);
1601 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1603 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1605 #ifdef CONFIG_SYN_COOKIES
1606 const struct tcphdr *th = tcp_hdr(skb);
1609 sk = cookie_v4_check(sk, skb);
1614 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1615 struct tcphdr *th, u32 *cookie)
1618 #ifdef CONFIG_SYN_COOKIES
1619 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1620 &tcp_request_sock_ipv4_ops, sk, th);
1622 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1623 tcp_synq_overflow(sk);
1629 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1631 /* The socket must have its spinlock held when we get
1632 * here, unless it is a TCP_LISTEN socket.
1634 * We have a potential double-lock case here, so even when
1635 * doing backlog processing we use the BH locking scheme.
1636 * This is because we cannot sleep with the original spinlock
1639 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1641 enum skb_drop_reason reason;
1644 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1645 struct dst_entry *dst;
1647 dst = rcu_dereference_protected(sk->sk_rx_dst,
1648 lockdep_sock_is_held(sk));
1650 sock_rps_save_rxhash(sk, skb);
1651 sk_mark_napi_id(sk, skb);
1653 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1654 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1656 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1660 tcp_rcv_established(sk, skb);
1664 reason = SKB_DROP_REASON_NOT_SPECIFIED;
1665 if (tcp_checksum_complete(skb))
1668 if (sk->sk_state == TCP_LISTEN) {
1669 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1674 if (tcp_child_process(sk, nsk, skb)) {
1681 sock_rps_save_rxhash(sk, skb);
1683 if (tcp_rcv_state_process(sk, skb)) {
1690 tcp_v4_send_reset(rsk, skb);
1692 kfree_skb_reason(skb, reason);
1693 /* Be careful here. If this function gets more complicated and
1694 * gcc suffers from register pressure on the x86, sk (in %ebx)
1695 * might be destroyed here. This current version compiles correctly,
1696 * but you have been warned.
1701 reason = SKB_DROP_REASON_TCP_CSUM;
1702 trace_tcp_bad_csum(skb);
1703 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1704 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1707 EXPORT_SYMBOL(tcp_v4_do_rcv);
1709 int tcp_v4_early_demux(struct sk_buff *skb)
1711 const struct iphdr *iph;
1712 const struct tcphdr *th;
1715 if (skb->pkt_type != PACKET_HOST)
1718 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1724 if (th->doff < sizeof(struct tcphdr) / 4)
1727 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1728 iph->saddr, th->source,
1729 iph->daddr, ntohs(th->dest),
1730 skb->skb_iif, inet_sdif(skb));
1733 skb->destructor = sock_edemux;
1734 if (sk_fullsock(sk)) {
1735 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1738 dst = dst_check(dst, 0);
1740 sk->sk_rx_dst_ifindex == skb->skb_iif)
1741 skb_dst_set_noref(skb, dst);
1747 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1748 enum skb_drop_reason *reason)
1750 u32 limit, tail_gso_size, tail_gso_segs;
1751 struct skb_shared_info *shinfo;
1752 const struct tcphdr *th;
1753 struct tcphdr *thtail;
1754 struct sk_buff *tail;
1755 unsigned int hdrlen;
1761 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1762 * we can fix skb->truesize to its real value to avoid future drops.
1763 * This is valid because skb is not yet charged to the socket.
1764 * It has been noticed that pure SACK packets were sometimes dropped
1765 * (if cooked by drivers without copybreak feature).
1771 if (unlikely(tcp_checksum_complete(skb))) {
1773 trace_tcp_bad_csum(skb);
1774 *reason = SKB_DROP_REASON_TCP_CSUM;
1775 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1776 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1780 /* Attempt coalescing to last skb in backlog, even if we are
1781 * above the limits.
1782 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1784 th = (const struct tcphdr *)skb->data;
1785 hdrlen = th->doff * 4;
1787 tail = sk->sk_backlog.tail;
1790 thtail = (struct tcphdr *)tail->data;
1792 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1793 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1794 ((TCP_SKB_CB(tail)->tcp_flags |
1795 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1796 !((TCP_SKB_CB(tail)->tcp_flags &
1797 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1798 ((TCP_SKB_CB(tail)->tcp_flags ^
1799 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1800 #ifdef CONFIG_TLS_DEVICE
1801 tail->decrypted != skb->decrypted ||
1803 thtail->doff != th->doff ||
1804 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1807 __skb_pull(skb, hdrlen);
1809 shinfo = skb_shinfo(skb);
1810 gso_size = shinfo->gso_size ?: skb->len;
1811 gso_segs = shinfo->gso_segs ?: 1;
1813 shinfo = skb_shinfo(tail);
1814 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1815 tail_gso_segs = shinfo->gso_segs ?: 1;
1817 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1818 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1820 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1821 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1822 thtail->window = th->window;
1825 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1826 * thtail->fin, so that the fast path in tcp_rcv_established()
1827 * is not entered if we append a packet with a FIN.
1828 * SYN, RST, URG are not present.
1829 * ACK is set on both packets.
1830 * PSH : we do not really care in TCP stack,
1831 * at least for 'GRO' packets.
1833 thtail->fin |= th->fin;
1834 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1836 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1837 TCP_SKB_CB(tail)->has_rxtstamp = true;
1838 tail->tstamp = skb->tstamp;
1839 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1842 /* Not as strict as GRO. We only need to carry mss max value */
1843 shinfo->gso_size = max(gso_size, tail_gso_size);
1844 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1846 sk->sk_backlog.len += delta;
1847 __NET_INC_STATS(sock_net(sk),
1848 LINUX_MIB_TCPBACKLOGCOALESCE);
1849 kfree_skb_partial(skb, fragstolen);
1852 __skb_push(skb, hdrlen);
1855 /* Only the socket owner can try to collapse/prune rx queues
1856 * to reduce memory overhead, so add a little headroom here.
1857 * Few socket backlogs are likely to be non-empty concurrently.
1859 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
1861 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1863 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1864 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1869 EXPORT_SYMBOL(tcp_add_backlog);
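/* Illustrative note: a worked example of the coalescing above. If the
 * backlog tail already carries two 1448-byte segments (gso_size 1448,
 * gso_segs 2) and the new skb is a single in-order 1448-byte segment with
 * identical flags and DSCP, skb_try_coalesce() glues the payload onto the
 * tail, end_seq (and, if newer, ack_seq/window) is taken from the new
 * segment, and the tail ends up with gso_size 1448 and gso_segs 3, so one
 * backlog entry is processed instead of two.
 */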
1871 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1873 struct tcphdr *th = (struct tcphdr *)skb->data;
1875 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1877 EXPORT_SYMBOL(tcp_filter);
1879 static void tcp_v4_restore_cb(struct sk_buff *skb)
1881 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1882 sizeof(struct inet_skb_parm));
1885 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1886 const struct tcphdr *th)
1888 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1889 * barrier() makes sure the compiler won't play aliasing games.
1891 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1892 sizeof(struct inet_skb_parm));
1895 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1896 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1897 skb->len - th->doff * 4);
1898 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1899 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1900 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1901 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1902 TCP_SKB_CB(skb)->sacked = 0;
1903 TCP_SKB_CB(skb)->has_rxtstamp =
1904 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
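/* Illustrative note: end_seq above counts the sequence space a segment
 * consumes, including the SYN and FIN flags. For a hypothetical segment with
 * seq 1000, a 20-byte TCP header (doff == 5), 500 bytes of payload and FIN
 * set:
 *
 *	end_seq = 1000 + 0 (syn) + 1 (fin) + (520 - 20) = 1501
 *
 * i.e. the FIN consumes one sequence number beyond the payload.
 */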
1911 int tcp_v4_rcv(struct sk_buff *skb)
1913 struct net *net = dev_net(skb->dev);
1914 enum skb_drop_reason drop_reason;
1915 int sdif = inet_sdif(skb);
1916 int dif = inet_iif(skb);
1917 const struct iphdr *iph;
1918 const struct tcphdr *th;
1923 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1924 if (skb->pkt_type != PACKET_HOST)
1927 /* Count it even if it's bad */
1928 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1930 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1933 th = (const struct tcphdr *)skb->data;
1935 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1936 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1939 if (!pskb_may_pull(skb, th->doff * 4))
1942 /* An explanation is required here, I think.
1943 * Packet length and doff are validated by header prediction,
1944 * provided the case of th->doff == 0 is eliminated.
1945 * So, we defer the checks. */
1947 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1950 th = (const struct tcphdr *)skb->data;
1953 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1954 th->dest, sdif, &refcounted);
1959 if (sk->sk_state == TCP_TIME_WAIT)
1962 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1963 struct request_sock *req = inet_reqsk(sk);
1964 bool req_stolen = false;
1967 sk = req->rsk_listener;
1968 drop_reason = tcp_inbound_md5_hash(sk, skb,
1969 &iph->saddr, &iph->daddr,
1970 AF_INET, dif, sdif);
1971 if (unlikely(drop_reason)) {
1972 sk_drops_add(sk, skb);
1976 if (tcp_checksum_complete(skb)) {
1980 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1981 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
1983 inet_csk_reqsk_queue_drop_and_put(sk, req);
1987 /* reuseport_migrate_sock() has already held one sk_refcnt
1991 /* We own a reference on the listener, increase it again
1992 * as we might lose it too soon.
1998 if (!tcp_filter(sk, skb)) {
1999 th = (const struct tcphdr *)skb->data;
2001 tcp_v4_fill_cb(skb, iph, th);
2002 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2004 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2009 /* Another cpu got exclusive access to req
2010 * and created a full blown socket.
2011 * Try to feed this packet to this socket
2012 * instead of discarding it.
2014 tcp_v4_restore_cb(skb);
2018 goto discard_and_relse;
2022 tcp_v4_restore_cb(skb);
2023 } else if (tcp_child_process(sk, nsk, skb)) {
2024 tcp_v4_send_reset(nsk, skb);
2025 goto discard_and_relse;
2032 if (static_branch_unlikely(&ip4_min_ttl)) {
2033 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2034 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2035 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2036 goto discard_and_relse;
2040 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2041 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2042 goto discard_and_relse;
2045 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2046 &iph->daddr, AF_INET, dif, sdif);
2048 goto discard_and_relse;
2052 if (tcp_filter(sk, skb)) {
2053 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2054 goto discard_and_relse;
2056 th = (const struct tcphdr *)skb->data;
2058 tcp_v4_fill_cb(skb, iph, th);
2062 if (sk->sk_state == TCP_LISTEN) {
2063 ret = tcp_v4_do_rcv(sk, skb);
2064 goto put_and_return;
2067 sk_incoming_cpu_update(sk);
2069 sk_defer_free_flush(sk);
2070 bh_lock_sock_nested(sk);
2071 tcp_segs_in(tcp_sk(sk), skb);
2073 if (!sock_owned_by_user(sk)) {
2074 ret = tcp_v4_do_rcv(sk, skb);
2076 if (tcp_add_backlog(sk, skb, &drop_reason))
2077 goto discard_and_relse;
2088 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2089 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2092 tcp_v4_fill_cb(skb, iph, th);
2094 if (tcp_checksum_complete(skb)) {
2096 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2097 trace_tcp_bad_csum(skb);
2098 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2100 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2102 tcp_v4_send_reset(NULL, skb);
2106 /* Discard frame. */
2107 kfree_skb_reason(skb, drop_reason);
2111 sk_drops_add(sk, skb);
2117 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2118 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2119 inet_twsk_put(inet_twsk(sk));
2123 tcp_v4_fill_cb(skb, iph, th);
2125 if (tcp_checksum_complete(skb)) {
2126 inet_twsk_put(inet_twsk(sk));
2129 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2131 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2134 iph->saddr, th->source,
2135 iph->daddr, th->dest,
2139 inet_twsk_deschedule_put(inet_twsk(sk));
2141 tcp_v4_restore_cb(skb);
2149 tcp_v4_timewait_ack(sk, skb);
2152 tcp_v4_send_reset(sk, skb);
2153 inet_twsk_deschedule_put(inet_twsk(sk));
2155 case TCP_TW_SUCCESS:;
2160 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2161 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2162 .twsk_unique = tcp_twsk_unique,
2163 .twsk_destructor= tcp_twsk_destructor,
2166 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2168 struct dst_entry *dst = skb_dst(skb);
2170 if (dst && dst_hold_safe(dst)) {
2171 rcu_assign_pointer(sk->sk_rx_dst, dst);
2172 sk->sk_rx_dst_ifindex = skb->skb_iif;
2175 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2177 const struct inet_connection_sock_af_ops ipv4_specific = {
2178 .queue_xmit = ip_queue_xmit,
2179 .send_check = tcp_v4_send_check,
2180 .rebuild_header = inet_sk_rebuild_header,
2181 .sk_rx_dst_set = inet_sk_rx_dst_set,
2182 .conn_request = tcp_v4_conn_request,
2183 .syn_recv_sock = tcp_v4_syn_recv_sock,
2184 .net_header_len = sizeof(struct iphdr),
2185 .setsockopt = ip_setsockopt,
2186 .getsockopt = ip_getsockopt,
2187 .addr2sockaddr = inet_csk_addr2sockaddr,
2188 .sockaddr_len = sizeof(struct sockaddr_in),
2189 .mtu_reduced = tcp_v4_mtu_reduced,
2191 EXPORT_SYMBOL(ipv4_specific);
2193 #ifdef CONFIG_TCP_MD5SIG
2194 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2195 .md5_lookup = tcp_v4_md5_lookup,
2196 .calc_md5_hash = tcp_v4_md5_hash_skb,
2197 .md5_parse = tcp_v4_parse_md5_keys,
2201 /* NOTE: A lot of things are set to zero explicitly by the call to
2202 * sk_alloc(), so they need not be done here.
2204 static int tcp_v4_init_sock(struct sock *sk)
2206 struct inet_connection_sock *icsk = inet_csk(sk);
2210 icsk->icsk_af_ops = &ipv4_specific;
2212 #ifdef CONFIG_TCP_MD5SIG
2213 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2219 void tcp_v4_destroy_sock(struct sock *sk)
2221 struct tcp_sock *tp = tcp_sk(sk);
2223 trace_tcp_destroy_sock(sk);
2225 tcp_clear_xmit_timers(sk);
2227 tcp_cleanup_congestion_control(sk);
2229 tcp_cleanup_ulp(sk);
2231 /* Clean up the write buffer. */
2232 tcp_write_queue_purge(sk);
2234 /* Check if we want to disable active TFO */
2235 tcp_fastopen_active_disable_ofo_check(sk);
2237 /* Cleans up our, hopefully empty, out_of_order_queue. */
2238 skb_rbtree_purge(&tp->out_of_order_queue);
2240 #ifdef CONFIG_TCP_MD5SIG
2241 /* Clean up the MD5 key list, if any */
2242 if (tp->md5sig_info) {
2243 tcp_clear_md5_list(sk);
2244 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2245 tp->md5sig_info = NULL;
2249 /* Clean up a referenced TCP bind bucket. */
2250 if (inet_csk(sk)->icsk_bind_hash)
2253 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2255 /* If socket is aborted during connect operation */
2256 tcp_free_fastopen_req(tp);
2257 tcp_fastopen_destroy_cipher(sk);
2258 tcp_saved_syn_free(tp);
2260 sk_sockets_allocated_dec(sk);
2262 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2264 #ifdef CONFIG_PROC_FS
2265 /* Proc filesystem TCP sock list dumping. */
2267 static unsigned short seq_file_family(const struct seq_file *seq);
2269 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2271 unsigned short family = seq_file_family(seq);
2273 /* AF_UNSPEC is used as a match all */
2274 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2275 net_eq(sock_net(sk), seq_file_net(seq)));
2278 /* Find a non-empty bucket (starting from st->bucket)
2279 * and return the first sk from it.
2281 static void *listening_get_first(struct seq_file *seq)
2283 struct tcp_iter_state *st = seq->private;
2286 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2287 struct inet_listen_hashbucket *ilb2;
2288 struct inet_connection_sock *icsk;
2291 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2292 if (hlist_empty(&ilb2->head))
2295 spin_lock(&ilb2->lock);
2296 inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
2297 sk = (struct sock *)icsk;
2298 if (seq_sk_match(seq, sk))
2301 spin_unlock(&ilb2->lock);
2307 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2308 * If "cur" is the last one in the st->bucket,
2309 * call listening_get_first() to return the first sk of the next
2312 static void *listening_get_next(struct seq_file *seq, void *cur)
2314 struct tcp_iter_state *st = seq->private;
2315 struct inet_listen_hashbucket *ilb2;
2316 struct inet_connection_sock *icsk;
2317 struct sock *sk = cur;
2322 icsk = inet_csk(sk);
2323 inet_lhash2_for_each_icsk_continue(icsk) {
2324 sk = (struct sock *)icsk;
2325 if (seq_sk_match(seq, sk))
2329 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2330 spin_unlock(&ilb2->lock);
2332 return listening_get_first(seq);
2335 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2337 struct tcp_iter_state *st = seq->private;
2342 rc = listening_get_first(seq);
2344 while (rc && *pos) {
2345 rc = listening_get_next(seq, rc);
2351 static inline bool empty_bucket(const struct tcp_iter_state *st)
2353 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2357 * Get first established socket starting from bucket given in st->bucket.
2358 * If st->bucket is zero, the very first socket in the hash is returned.
2360 static void *established_get_first(struct seq_file *seq)
2362 struct tcp_iter_state *st = seq->private;
2365 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2367 struct hlist_nulls_node *node;
2368 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2370 /* Lockless fast path for the common case of empty buckets */
2371 if (empty_bucket(st))
2375 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2376 if (seq_sk_match(seq, sk))
2379 spin_unlock_bh(lock);
2385 static void *established_get_next(struct seq_file *seq, void *cur)
2387 struct sock *sk = cur;
2388 struct hlist_nulls_node *node;
2389 struct tcp_iter_state *st = seq->private;
2394 sk = sk_nulls_next(sk);
2396 sk_nulls_for_each_from(sk, node) {
2397 if (seq_sk_match(seq, sk))
2401 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2403 return established_get_first(seq);
2406 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2408 struct tcp_iter_state *st = seq->private;
2412 rc = established_get_first(seq);
2415 rc = established_get_next(seq, rc);
2421 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2424 struct tcp_iter_state *st = seq->private;
2426 st->state = TCP_SEQ_STATE_LISTENING;
2427 rc = listening_get_idx(seq, &pos);
2430 st->state = TCP_SEQ_STATE_ESTABLISHED;
2431 rc = established_get_idx(seq, pos);
2437 static void *tcp_seek_last_pos(struct seq_file *seq)
2439 struct tcp_iter_state *st = seq->private;
2440 int bucket = st->bucket;
2441 int offset = st->offset;
2442 int orig_num = st->num;
2445 switch (st->state) {
2446 case TCP_SEQ_STATE_LISTENING:
2447 if (st->bucket > tcp_hashinfo.lhash2_mask)
2449 st->state = TCP_SEQ_STATE_LISTENING;
2450 rc = listening_get_first(seq);
2451 while (offset-- && rc && bucket == st->bucket)
2452 rc = listening_get_next(seq, rc);
2456 st->state = TCP_SEQ_STATE_ESTABLISHED;
2458 case TCP_SEQ_STATE_ESTABLISHED:
2459 if (st->bucket > tcp_hashinfo.ehash_mask)
2461 rc = established_get_first(seq);
2462 while (offset-- && rc && bucket == st->bucket)
2463 rc = established_get_next(seq, rc);
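/* Usage note (not from the original source): every read() chunk of
 * /proc/net/tcp re-enters tcp_seq_start().  When *pos still matches the
 * cached st->last_pos, tcp_seek_last_pos() above resumes from the saved
 * state/bucket/offset instead of rescanning the hash tables from the
 * beginning, which keeps sequential dumps of large tables roughly linear.
 */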
2471 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2473 struct tcp_iter_state *st = seq->private;
2476 if (*pos && *pos == st->last_pos) {
2477 rc = tcp_seek_last_pos(seq);
2482 st->state = TCP_SEQ_STATE_LISTENING;
2486 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2489 st->last_pos = *pos;
2492 EXPORT_SYMBOL(tcp_seq_start);
2494 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2496 struct tcp_iter_state *st = seq->private;
2499 if (v == SEQ_START_TOKEN) {
2500 rc = tcp_get_idx(seq, 0);
2504 switch (st->state) {
2505 case TCP_SEQ_STATE_LISTENING:
2506 rc = listening_get_next(seq, v);
2508 st->state = TCP_SEQ_STATE_ESTABLISHED;
2511 rc = established_get_first(seq);
2514 case TCP_SEQ_STATE_ESTABLISHED:
2515 rc = established_get_next(seq, v);
2520 st->last_pos = *pos;
2523 EXPORT_SYMBOL(tcp_seq_next);
2525 void tcp_seq_stop(struct seq_file *seq, void *v)
2527 struct tcp_iter_state *st = seq->private;
2529 switch (st->state) {
2530 case TCP_SEQ_STATE_LISTENING:
2531 if (v != SEQ_START_TOKEN)
2532 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2534 case TCP_SEQ_STATE_ESTABLISHED:
2536 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2540 EXPORT_SYMBOL(tcp_seq_stop);
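/* A sketch of the dispatch done by tcp4_seq_show() below (descriptive,
 * not in the original source): each helper prints one row of
 * /proc/net/tcp.  NEW_SYN_RECV entries go through get_openreq4(),
 * TIME_WAIT entries through get_timewait4_sock(), and everything else
 * through get_tcp4_sock().  Addresses are emitted as raw hexadecimal
 * __be32 values while ports are converted with ntohs() first, matching
 * the historical format that tools such as netstat parse.
 */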
2542 static void get_openreq4(const struct request_sock *req,
2543 struct seq_file *f, int i)
2545 const struct inet_request_sock *ireq = inet_rsk(req);
2546 long delta = req->rsk_timer.expires - jiffies;
2548 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2549 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2554 ntohs(ireq->ir_rmt_port),
2556 0, 0, /* could print option size, but that is af dependent. */
2557 1, /* timers active (only the expire timer) */
2558 jiffies_delta_to_clock_t(delta),
2560 from_kuid_munged(seq_user_ns(f),
2561 sock_i_uid(req->rsk_listener)),
2562 0, /* non standard timer */
2563 0, /* open_requests have no inode */
2568 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2571 unsigned long timer_expires;
2572 const struct tcp_sock *tp = tcp_sk(sk);
2573 const struct inet_connection_sock *icsk = inet_csk(sk);
2574 const struct inet_sock *inet = inet_sk(sk);
2575 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2576 __be32 dest = inet->inet_daddr;
2577 __be32 src = inet->inet_rcv_saddr;
2578 __u16 destp = ntohs(inet->inet_dport);
2579 __u16 srcp = ntohs(inet->inet_sport);
2583 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2584 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2585 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2587 timer_expires = icsk->icsk_timeout;
2588 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2590 timer_expires = icsk->icsk_timeout;
2591 } else if (timer_pending(&sk->sk_timer)) {
2593 timer_expires = sk->sk_timer.expires;
2596 timer_expires = jiffies;
2599 state = inet_sk_state_load(sk);
2600 if (state == TCP_LISTEN)
2601 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2603 /* Because we don't lock the socket,
2604 * we might find a transient negative value.
2606 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2607 READ_ONCE(tp->copied_seq), 0);
2609 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2610 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2611 i, src, srcp, dest, destp, state,
2612 READ_ONCE(tp->write_seq) - tp->snd_una,
2615 jiffies_delta_to_clock_t(timer_expires - jiffies),
2616 icsk->icsk_retransmits,
2617 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2618 icsk->icsk_probes_out,
2620 refcount_read(&sk->sk_refcnt), sk,
2621 jiffies_to_clock_t(icsk->icsk_rto),
2622 jiffies_to_clock_t(icsk->icsk_ack.ato),
2623 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2625 state == TCP_LISTEN ?
2626 fastopenq->max_qlen :
2627 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2630 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2631 struct seq_file *f, int i)
2633 long delta = tw->tw_timer.expires - jiffies;
2637 dest = tw->tw_daddr;
2638 src = tw->tw_rcv_saddr;
2639 destp = ntohs(tw->tw_dport);
2640 srcp = ntohs(tw->tw_sport);
2642 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2643 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2644 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2645 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2646 refcount_read(&tw->tw_refcnt), tw);
2651 static int tcp4_seq_show(struct seq_file *seq, void *v)
2653 struct tcp_iter_state *st;
2654 struct sock *sk = v;
2656 seq_setwidth(seq, TMPSZ - 1);
2657 if (v == SEQ_START_TOKEN) {
2658 seq_puts(seq, " sl local_address rem_address st tx_queue "
2659 "rx_queue tr tm->when retrnsmt uid timeout "
2665 if (sk->sk_state == TCP_TIME_WAIT)
2666 get_timewait4_sock(v, seq, st->num);
2667 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2668 get_openreq4(v, seq, st->num);
2670 get_tcp4_sock(v, seq, st->num);
2676 #ifdef CONFIG_BPF_SYSCALL
2677 struct bpf_tcp_iter_state {
2678 struct tcp_iter_state state;
2679 unsigned int cur_sk;
2680 unsigned int end_sk;
2681 unsigned int max_sk;
2682 struct sock **batch;
2683 bool st_bucket_done;
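/* Descriptive note on the batching state above (not in the original
 * source): bpf_iter_tcp_batch() copies every matching socket of the
 * current bucket into @batch, taking a reference with sock_hold(), so
 * the bucket lock can be dropped before the BPF program runs on each
 * socket.  @cur_sk/@end_sk track the show/next cursor within the batch,
 * @max_sk is the allocated capacity, and @st_bucket_done records whether
 * the whole bucket fit, so the next start() can skip straight to the
 * following bucket.
 */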
2686 struct bpf_iter__tcp {
2687 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2688 __bpf_md_ptr(struct sock_common *, sk_common);
2689 uid_t uid __aligned(8);
2692 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2693 struct sock_common *sk_common, uid_t uid)
2695 struct bpf_iter__tcp ctx;
2697 meta->seq_num--; /* skip SEQ_START_TOKEN */
2699 ctx.sk_common = sk_common;
2701 return bpf_iter_run_prog(prog, &ctx);
2704 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2706 while (iter->cur_sk < iter->end_sk)
2707 sock_put(iter->batch[iter->cur_sk++]);
2710 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2711 unsigned int new_batch_sz)
2713 struct sock **new_batch;
2715 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2716 GFP_USER | __GFP_NOWARN);
2720 bpf_iter_tcp_put_batch(iter);
2721 kvfree(iter->batch);
2722 iter->batch = new_batch;
2723 iter->max_sk = new_batch_sz;
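/* Note (based on the code below): the two *_batch() helpers return the
 * number of matching sockets found in the bucket ("expected"), while only
 * up to iter->max_sk of them are actually stored in iter->batch.
 * bpf_iter_tcp_batch() compares end_sk against that count to detect
 * overflow, grows the batch (to roughly 3/2 of "expected") and retries
 * the bucket.
 */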
2728 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2729 struct sock *start_sk)
2731 struct bpf_tcp_iter_state *iter = seq->private;
2732 struct tcp_iter_state *st = &iter->state;
2733 struct inet_connection_sock *icsk;
2734 unsigned int expected = 1;
2737 sock_hold(start_sk);
2738 iter->batch[iter->end_sk++] = start_sk;
2740 icsk = inet_csk(start_sk);
2741 inet_lhash2_for_each_icsk_continue(icsk) {
2742 sk = (struct sock *)icsk;
2743 if (seq_sk_match(seq, sk)) {
2744 if (iter->end_sk < iter->max_sk) {
2746 iter->batch[iter->end_sk++] = sk;
2751 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2756 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2757 struct sock *start_sk)
2759 struct bpf_tcp_iter_state *iter = seq->private;
2760 struct tcp_iter_state *st = &iter->state;
2761 struct hlist_nulls_node *node;
2762 unsigned int expected = 1;
2765 sock_hold(start_sk);
2766 iter->batch[iter->end_sk++] = start_sk;
2768 sk = sk_nulls_next(start_sk);
2769 sk_nulls_for_each_from(sk, node) {
2770 if (seq_sk_match(seq, sk)) {
2771 if (iter->end_sk < iter->max_sk) {
2773 iter->batch[iter->end_sk++] = sk;
2778 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2783 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2785 struct bpf_tcp_iter_state *iter = seq->private;
2786 struct tcp_iter_state *st = &iter->state;
2787 unsigned int expected;
2788 bool resized = false;
2791 /* The st->bucket is done. Directly advance to the next
2792 * bucket instead of having tcp_seek_last_pos() skip sockets
2793 * one by one in the current bucket, only to find out that
2794 * it has to advance to the next bucket anyway.
2796 if (iter->st_bucket_done) {
2799 if (st->state == TCP_SEQ_STATE_LISTENING &&
2800 st->bucket > tcp_hashinfo.lhash2_mask) {
2801 st->state = TCP_SEQ_STATE_ESTABLISHED;
2807 /* Get a new batch */
2810 iter->st_bucket_done = false;
2812 sk = tcp_seek_last_pos(seq);
2814 return NULL; /* Done */
2816 if (st->state == TCP_SEQ_STATE_LISTENING)
2817 expected = bpf_iter_tcp_listening_batch(seq, sk);
2819 expected = bpf_iter_tcp_established_batch(seq, sk);
2821 if (iter->end_sk == expected) {
2822 iter->st_bucket_done = true;
2826 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2834 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2836 /* bpf iter does not support lseek, so it always
2837 * continues from where it was stop()-ped.
2840 return bpf_iter_tcp_batch(seq);
2842 return SEQ_START_TOKEN;
2845 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2847 struct bpf_tcp_iter_state *iter = seq->private;
2848 struct tcp_iter_state *st = &iter->state;
2851 /* Whenever seq_next() is called, the iter->cur_sk is
2852 * done with seq_show(), so advance to the next sk in the batch.
2855 if (iter->cur_sk < iter->end_sk) {
2856 /* Keeping st->num consistent in tcp_iter_state.
2857 * bpf_iter_tcp does not use st->num.
2858 * meta.seq_num is used instead.
2861 /* Move st->offset to the next sk in the bucket such that
2862 * the future start() will resume at st->offset in
2863 * st->bucket. See tcp_seek_last_pos().
2866 sock_put(iter->batch[iter->cur_sk++]);
2869 if (iter->cur_sk < iter->end_sk)
2870 sk = iter->batch[iter->cur_sk];
2872 sk = bpf_iter_tcp_batch(seq);
2875 /* Keeping st->last_pos consistent in tcp_iter_state.
2876 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2878 st->last_pos = *pos;
2882 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2884 struct bpf_iter_meta meta;
2885 struct bpf_prog *prog;
2886 struct sock *sk = v;
2891 if (v == SEQ_START_TOKEN)
2894 if (sk_fullsock(sk))
2895 slow = lock_sock_fast(sk);
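/* Note (descriptive, not in the original source): only full sockets can
 * be locked here; TIME_WAIT and NEW_SYN_RECV entries are minisockets
 * without a real socket lock, hence the sk_fullsock() checks around
 * lock_sock_fast()/unlock_sock_fast().
 */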
2897 if (unlikely(sk_unhashed(sk))) {
2902 if (sk->sk_state == TCP_TIME_WAIT) {
2904 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2905 const struct request_sock *req = v;
2907 uid = from_kuid_munged(seq_user_ns(seq),
2908 sock_i_uid(req->rsk_listener));
2910 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2914 prog = bpf_iter_get_info(&meta, false);
2915 ret = tcp_prog_seq_show(prog, &meta, v, uid);
2918 if (sk_fullsock(sk))
2919 unlock_sock_fast(sk, slow);
2924 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2926 struct bpf_tcp_iter_state *iter = seq->private;
2927 struct bpf_iter_meta meta;
2928 struct bpf_prog *prog;
2932 prog = bpf_iter_get_info(&meta, true);
2934 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2937 if (iter->cur_sk < iter->end_sk) {
2938 bpf_iter_tcp_put_batch(iter);
2939 iter->st_bucket_done = false;
2943 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2944 .show = bpf_iter_tcp_seq_show,
2945 .start = bpf_iter_tcp_seq_start,
2946 .next = bpf_iter_tcp_seq_next,
2947 .stop = bpf_iter_tcp_seq_stop,
2950 static unsigned short seq_file_family(const struct seq_file *seq)
2952 const struct tcp_seq_afinfo *afinfo;
2954 #ifdef CONFIG_BPF_SYSCALL
2955 /* Iterated from bpf_iter. Let the bpf prog filter instead. */
2956 if (seq->op == &bpf_iter_tcp_seq_ops)
2960 /* Iterated from proc fs */
2961 afinfo = pde_data(file_inode(seq->file));
2962 return afinfo->family;
2965 static const struct seq_operations tcp4_seq_ops = {
2966 .show = tcp4_seq_show,
2967 .start = tcp_seq_start,
2968 .next = tcp_seq_next,
2969 .stop = tcp_seq_stop,
2972 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2976 static int __net_init tcp4_proc_init_net(struct net *net)
2978 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2979 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2984 static void __net_exit tcp4_proc_exit_net(struct net *net)
2986 remove_proc_entry("tcp", net->proc_net);
2989 static struct pernet_operations tcp4_net_ops = {
2990 .init = tcp4_proc_init_net,
2991 .exit = tcp4_proc_exit_net,
2994 int __init tcp4_proc_init(void)
2996 return register_pernet_subsys(&tcp4_net_ops);
2999 void tcp4_proc_exit(void)
3001 unregister_pernet_subsys(&tcp4_net_ops);
3003 #endif /* CONFIG_PROC_FS */
3005 /* @wake is one when sk_stream_write_space() calls us.
3006 * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3007 * This mimics the strategy used in sock_def_write_space().
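* A worked example (illustrative numbers, not from the source): with
* tcp_notsent_lowat set to 128 KB, a plain poll sees the socket writable
* while notsent_bytes < 128 KB, but a wakeup from sk_stream_write_space()
* (wake == 1) only signals EPOLLOUT once notsent_bytes has dropped below
* 64 KB, because the value is shifted left by @wake before the comparison
* below.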
3009 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3011 const struct tcp_sock *tp = tcp_sk(sk);
3012 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3013 READ_ONCE(tp->snd_nxt);
3015 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3017 EXPORT_SYMBOL(tcp_stream_memory_free);
3019 struct proto tcp_prot = {
3021 .owner = THIS_MODULE,
3023 .pre_connect = tcp_v4_pre_connect,
3024 .connect = tcp_v4_connect,
3025 .disconnect = tcp_disconnect,
3026 .accept = inet_csk_accept,
3028 .init = tcp_v4_init_sock,
3029 .destroy = tcp_v4_destroy_sock,
3030 .shutdown = tcp_shutdown,
3031 .setsockopt = tcp_setsockopt,
3032 .getsockopt = tcp_getsockopt,
3033 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3034 .keepalive = tcp_set_keepalive,
3035 .recvmsg = tcp_recvmsg,
3036 .sendmsg = tcp_sendmsg,
3037 .sendpage = tcp_sendpage,
3038 .backlog_rcv = tcp_v4_do_rcv,
3039 .release_cb = tcp_release_cb,
3041 .unhash = inet_unhash,
3042 .get_port = inet_csk_get_port,
3043 .put_port = inet_put_port,
3044 #ifdef CONFIG_BPF_SYSCALL
3045 .psock_update_sk_prot = tcp_bpf_update_proto,
3047 .enter_memory_pressure = tcp_enter_memory_pressure,
3048 .leave_memory_pressure = tcp_leave_memory_pressure,
3049 .stream_memory_free = tcp_stream_memory_free,
3050 .sockets_allocated = &tcp_sockets_allocated,
3051 .orphan_count = &tcp_orphan_count,
3052 .memory_allocated = &tcp_memory_allocated,
3053 .memory_pressure = &tcp_memory_pressure,
3054 .sysctl_mem = sysctl_tcp_mem,
3055 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3056 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3057 .max_header = MAX_TCP_HEADER,
3058 .obj_size = sizeof(struct tcp_sock),
3059 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3060 .twsk_prot = &tcp_timewait_sock_ops,
3061 .rsk_prot = &tcp_request_sock_ops,
3062 .h.hashinfo = &tcp_hashinfo,
3063 .no_autobind = true,
3064 .diag_destroy = tcp_abort,
3066 EXPORT_SYMBOL(tcp_prot);
3068 static void __net_exit tcp_sk_exit(struct net *net)
3070 struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
3072 if (net->ipv4.tcp_congestion_control)
3073 bpf_module_put(net->ipv4.tcp_congestion_control,
3074 net->ipv4.tcp_congestion_control->owner);
3075 if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
3076 kfree(tcp_death_row);
3079 static int __net_init tcp_sk_init(struct net *net)
3083 net->ipv4.sysctl_tcp_ecn = 2;
3084 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3086 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3087 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3088 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3089 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3090 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3092 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3093 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3094 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3096 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3097 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3098 net->ipv4.sysctl_tcp_syncookies = 1;
3099 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3100 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3101 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3102 net->ipv4.sysctl_tcp_orphan_retries = 0;
3103 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3104 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
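/* Descriptive note (not in the original source): a tcp_tw_reuse value of
 * 2 enables TIME-WAIT reuse for loopback traffic only; 1 enables it
 * globally and 0 disables it.
 */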
3105 net->ipv4.sysctl_tcp_tw_reuse = 2;
3106 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3108 net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
3109 if (!net->ipv4.tcp_death_row)
3111 refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
3112 cnt = tcp_hashinfo.ehash_mask + 1;
3113 net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
3114 net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
3116 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3117 net->ipv4.sysctl_tcp_sack = 1;
3118 net->ipv4.sysctl_tcp_window_scaling = 1;
3119 net->ipv4.sysctl_tcp_timestamps = 1;
3120 net->ipv4.sysctl_tcp_early_retrans = 3;
3121 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3122 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3123 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3124 net->ipv4.sysctl_tcp_max_reordering = 300;
3125 net->ipv4.sysctl_tcp_dsack = 1;
3126 net->ipv4.sysctl_tcp_app_win = 31;
3127 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3128 net->ipv4.sysctl_tcp_frto = 2;
3129 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3130 /* This limits the percentage of the congestion window which we
3131 * will allow a single TSO frame to consume. Building TSO frames
3132 * which are too large can cause TCP streams to be bursty.
3134 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3135 /* Default TSQ limit of 16 TSO segments */
3136 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3137 /* RFC 5961 challenge ACK rate limiting */
3138 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3139 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3140 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
3141 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3142 net->ipv4.sysctl_tcp_autocorking = 1;
3143 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
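/* Descriptive note (not in the original source): the pacing ratios below
 * are percentages of the computed rate, so 200 paces at twice the rate
 * during slow start and 120 at 1.2x the rate afterwards.
 */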
3144 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3145 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3146 if (net != &init_net) {
3147 memcpy(net->ipv4.sysctl_tcp_rmem,
3148 init_net.ipv4.sysctl_tcp_rmem,
3149 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3150 memcpy(net->ipv4.sysctl_tcp_wmem,
3151 init_net.ipv4.sysctl_tcp_wmem,
3152 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3154 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3155 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3156 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3157 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3158 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3159 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3161 /* Reno is always built in */
3162 if (!net_eq(net, &init_net) &&
3163 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3164 init_net.ipv4.tcp_congestion_control->owner))
3165 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3167 net->ipv4.tcp_congestion_control = &tcp_reno;
3172 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3176 list_for_each_entry(net, net_exit_list, exit_list)
3177 tcp_fastopen_ctx_destroy(net);
3180 static struct pernet_operations __net_initdata tcp_sk_ops = {
3181 .init = tcp_sk_init,
3182 .exit = tcp_sk_exit,
3183 .exit_batch = tcp_sk_exit_batch,
3186 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3187 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3188 struct sock_common *sk_common, uid_t uid)
3190 #define INIT_BATCH_SZ 16
3192 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3194 struct bpf_tcp_iter_state *iter = priv_data;
3197 err = bpf_iter_init_seq_net(priv_data, aux);
3201 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3203 bpf_iter_fini_seq_net(priv_data);
3210 static void bpf_iter_fini_tcp(void *priv_data)
3212 struct bpf_tcp_iter_state *iter = priv_data;
3214 bpf_iter_fini_seq_net(priv_data);
3215 kvfree(iter->batch);
3218 static const struct bpf_iter_seq_info tcp_seq_info = {
3219 .seq_ops = &bpf_iter_tcp_seq_ops,
3220 .init_seq_private = bpf_iter_init_tcp,
3221 .fini_seq_private = bpf_iter_fini_tcp,
3222 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3225 static const struct bpf_func_proto *
3226 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3227 const struct bpf_prog *prog)
3230 case BPF_FUNC_setsockopt:
3231 return &bpf_sk_setsockopt_proto;
3232 case BPF_FUNC_getsockopt:
3233 return &bpf_sk_getsockopt_proto;
3239 static struct bpf_iter_reg tcp_reg_info = {
3241 .ctx_arg_info_size = 1,
3243 { offsetof(struct bpf_iter__tcp, sk_common),
3244 PTR_TO_BTF_ID_OR_NULL },
3246 .get_func_proto = bpf_iter_tcp_get_func_proto,
3247 .seq_info = &tcp_seq_info,
3250 static void __init bpf_iter_register(void)
3252 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3253 if (bpf_iter_reg_target(&tcp_reg_info))
3254 pr_warn("Warning: could not register bpf iterator tcp\n");
3259 void __init tcp_v4_init(void)
3263 for_each_possible_cpu(cpu) {
3266 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3267 IPPROTO_TCP, &init_net);
3269 panic("Failed to create the TCP control socket.\n");
3270 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3272 /* Please enforce IP_DF and IPID==0 for RST and
3273 * ACK sent in SYN-RECV and TIME-WAIT state.
3275 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3277 per_cpu(ipv4_tcp_sk, cpu) = sk;
3279 if (register_pernet_subsys(&tcp_sk_ops))
3280 panic("Failed to create the TCP control socket.\n");
3282 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3283 bpf_iter_register();