2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol (TCP).
8 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
26 * David S. Miller : New socket lookup architecture.
27 * This code is dedicated to John Dyson.
28 * David S. Miller : Change semantics of established hash,
29 * half is devoted to TIME_WAIT sockets
30 * and the rest go in the other half.
31 * Andi Kleen : Add support for syncookies and fixed
32 * some bugs: ip options weren't passed to
33 * the TCP layer, missed a check for an
34 * ACK bit.
35 * Andi Kleen : Implemented fast path mtu discovery.
36 * Fixed many serious bugs in the
37 * request_sock handling and moved
38 * most of it into the af independent code.
39 * Added tail drop and some other bugfixes.
40 * Added new listen semantics.
41 * Mike McLagan : Routing by source
42 * Juan Jose Ciarlante: ip_dynaddr bits
43 * Andi Kleen: various fixes.
44 * Vitaly E. Lavrov : Transparent proxy revived after year
45 * coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
53 #define pr_fmt(fmt) "TCP: " fmt
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
66 #include <net/net_namespace.h>
68 #include <net/inet_hashtables.h>
70 #include <net/transp_v6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
75 #include <net/netdma.h>
76 #include <net/secure_seq.h>
77 #include <net/tcp_memcontrol.h>
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
88 int sysctl_tcp_tw_reuse __read_mostly;
89 int sysctl_tcp_low_latency __read_mostly;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
93 #ifdef CONFIG_TCP_MD5SIG
94 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
95 __be32 daddr, __be32 saddr, const struct tcphdr *th);
98 struct inet_hashinfo tcp_hashinfo;
99 EXPORT_SYMBOL(tcp_hashinfo);
101 static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
103 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
106 tcp_hdr(skb)->source);
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112 struct tcp_sock *tp = tcp_sk(sk);
114 /* With PAWS, it is safe from the viewpoint
115 of data integrity. Even without PAWS it is safe provided sequence
116 spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
118 Actually, the idea is close to VJ's: only the timestamp cache is
119 held not per host but per port pair, and the TW bucket is used as the
120 state holder.
122 If the TW bucket has already been destroyed, we fall back to VJ's scheme
123 and use the initial timestamp retrieved from the peer table.
125 if (tcptw->tw_ts_recent_stamp &&
126 (twp == NULL || (sysctl_tcp_tw_reuse &&
127 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
128 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
129 if (tp->write_seq == 0)
130 tp->write_seq = 1;
131 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
132 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
139 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
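/*
 * Illustrative userspace sketch (not part of this file): tcp_twsk_unique()
 * only allows taking over a TIME-WAIT port pair when the tcp_tw_reuse
 * sysctl is set and the timestamp checks above make it safe. Flipping the
 * knob from C, assuming procfs is mounted at /proc:
 *
 *	#include <stdio.h>
 *
 *	static int enable_tw_reuse(void)
 *	{
 *		FILE *f = fopen("/proc/sys/net/ipv4/tcp_tw_reuse", "w");
 *
 *		if (!f)
 *			return -1;
 *		fputs("1\n", f);
 *		return fclose(f);
 *	}
 */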
141 static int tcp_repair_connect(struct sock *sk)
143 tcp_connect_init(sk);
144 tcp_finish_connect(sk, NULL);
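/*
 * Illustrative userspace sketch (not part of this file): this path is taken
 * instead of a real handshake when the socket is in repair mode. A process
 * with CAP_NET_ADMIN enters that mode via the TCP_REPAIR socket option
 * before calling connect(); no SYN is sent, the kernel only initializes
 * state. A minimal sketch (TCP_REPAIR is assumed to come from the kernel
 * headers; define it as 19 on older userspace headers):
 *
 *	#include <netinet/in.h>
 *	#include <sys/socket.h>
 *
 *	#ifndef TCP_REPAIR
 *	#define TCP_REPAIR 19
 *	#endif
 *
 *	static int repair_connect(int fd, const struct sockaddr_in *peer)
 *	{
 *		int on = 1;
 *
 *		if (setsockopt(fd, IPPROTO_TCP, TCP_REPAIR, &on, sizeof(on)))
 *			return -1;
 *		return connect(fd, (const struct sockaddr *)peer,
 *			       sizeof(*peer));
 *	}
 */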
149 /* This will initiate an outgoing connection. */
150 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
152 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
153 struct inet_sock *inet = inet_sk(sk);
154 struct tcp_sock *tp = tcp_sk(sk);
155 __be16 orig_sport, orig_dport;
156 __be32 daddr, nexthop;
160 struct ip_options_rcu *inet_opt;
162 if (addr_len < sizeof(struct sockaddr_in))
165 if (usin->sin_family != AF_INET)
166 return -EAFNOSUPPORT;
168 nexthop = daddr = usin->sin_addr.s_addr;
169 inet_opt = rcu_dereference_protected(inet->inet_opt,
170 sock_owned_by_user(sk));
171 if (inet_opt && inet_opt->opt.srr) {
174 nexthop = inet_opt->opt.faddr;
177 orig_sport = inet->inet_sport;
178 orig_dport = usin->sin_port;
179 fl4 = &inet->cork.fl.u.ip4;
180 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
181 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
183 orig_sport, orig_dport, sk, true);
186 if (err == -ENETUNREACH)
187 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
191 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
196 if (!inet_opt || !inet_opt->opt.srr)
199 if (!inet->inet_saddr)
200 inet->inet_saddr = fl4->saddr;
201 inet->inet_rcv_saddr = inet->inet_saddr;
203 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
204 /* Reset inherited state */
205 tp->rx_opt.ts_recent = 0;
206 tp->rx_opt.ts_recent_stamp = 0;
207 if (likely(!tp->repair))
211 if (tcp_death_row.sysctl_tw_recycle &&
212 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
213 tcp_fetch_timewait_stamp(sk, &rt->dst);
215 inet->inet_dport = usin->sin_port;
216 inet->inet_daddr = daddr;
218 inet_csk(sk)->icsk_ext_hdr_len = 0;
220 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
222 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
224 /* Socket identity is still unknown (sport may be zero).
225 * However we set state to SYN-SENT and, without releasing the socket
226 * lock, select a source port, enter ourselves into the hash tables and
227 * complete initialization after this.
229 tcp_set_state(sk, TCP_SYN_SENT);
230 err = inet_hash_connect(&tcp_death_row, sk);
234 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
235 inet->inet_sport, inet->inet_dport, sk);
241 /* OK, now commit destination to socket. */
242 sk->sk_gso_type = SKB_GSO_TCPV4;
243 sk_setup_caps(sk, &rt->dst);
245 if (!tp->write_seq && likely(!tp->repair))
246 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
251 inet->inet_id = tp->write_seq ^ jiffies;
253 if (likely(!tp->repair))
254 err = tcp_connect(sk);
256 err = tcp_repair_connect(sk);
266 * This unhashes the socket and releases the local port,
267 * if necessary.
269 tcp_set_state(sk, TCP_CLOSE);
271 sk->sk_route_caps = 0;
272 inet->inet_dport = 0;
275 EXPORT_SYMBOL(tcp_v4_connect);
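/*
 * Illustrative userspace sketch (not part of this file): the usual way into
 * tcp_v4_connect() is a plain connect() on an AF_INET stream socket:
 *
 *	#include <arpa/inet.h>
 *	#include <netinet/in.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static int tcp_dial(const char *ip, unsigned short port)
 *	{
 *		struct sockaddr_in sin;
 *		int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *		if (fd < 0)
 *			return -1;
 *		memset(&sin, 0, sizeof(sin));
 *		sin.sin_family = AF_INET;
 *		sin.sin_port = htons(port);
 *		if (inet_pton(AF_INET, ip, &sin.sin_addr) != 1 ||
 *		    connect(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
 *			close(fd);
 *			return -1;
 *		}
 *		return fd;
 *	}
 */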
278 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
279 * It can be called through tcp_release_cb() if socket was owned by user
280 * at the time tcp_v4_err() was called to handle ICMP message.
282 static void tcp_v4_mtu_reduced(struct sock *sk)
284 struct dst_entry *dst;
285 struct inet_sock *inet = inet_sk(sk);
286 u32 mtu = tcp_sk(sk)->mtu_info;
288 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
289 * sent out by Linux are always < 576 bytes, so they should go through
290 * unfragmented).
292 if (sk->sk_state == TCP_LISTEN)
295 dst = inet_csk_update_pmtu(sk, mtu);
299 /* Something is about to go wrong... Remember the soft error
300 * for the case that this connection is not able to recover.
302 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
303 sk->sk_err_soft = EMSGSIZE;
307 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
308 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
309 tcp_sync_mss(sk, mtu);
311 /* Resend the TCP packet because it's
312 * clear that the old packet has been
313 * dropped. This is the new "fast" path mtu
314 * discovery.
316 tcp_simple_retransmit(sk);
317 } /* else let the usual retransmit timer handle it */
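/*
 * Illustrative userspace sketch (not part of this file): whether the branch
 * above fires depends on the socket's pmtudisc setting, which userspace
 * controls with the IP_MTU_DISCOVER option; IP_PMTUDISC_DO forces DF so
 * that TCP gets to react to ICMP_FRAG_NEEDED as implemented here:
 *
 *	#include <netinet/in.h>
 *	#include <sys/socket.h>
 *
 *	static int force_pmtu_discovery(int fd)
 *	{
 *		int val = IP_PMTUDISC_DO;
 *
 *		return setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER,
 *				  &val, sizeof(val));
 *	}
 */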
320 static void do_redirect(struct sk_buff *skb, struct sock *sk)
322 struct dst_entry *dst = __sk_dst_check(sk, 0);
325 dst->ops->redirect(dst, sk, skb);
329 * This routine is called by the ICMP module when it gets some
330 * sort of error condition. If err < 0 then the socket should
331 * be closed and the error returned to the user. If err > 0
332 * it's just the icmp type << 8 | icmp code. After adjustment,
333 * header points to the first 8 bytes of the tcp header. We need
334 * to find the appropriate port.
336 * The locking strategy used here is very "optimistic". When
337 * someone else accesses the socket, the ICMP is just dropped,
338 * and for some paths there is no check at all.
339 * A more general error queue to queue errors for later handling
340 * is probably better.
344 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
346 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
347 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
348 struct inet_connection_sock *icsk;
350 struct inet_sock *inet;
351 const int type = icmp_hdr(icmp_skb)->type;
352 const int code = icmp_hdr(icmp_skb)->code;
358 struct net *net = dev_net(icmp_skb->dev);
360 if (icmp_skb->len < (iph->ihl << 2) + 8) {
361 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
365 sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
366 iph->saddr, th->source, inet_iif(icmp_skb));
368 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
371 if (sk->sk_state == TCP_TIME_WAIT) {
372 inet_twsk_put(inet_twsk(sk));
377 /* If too many ICMPs get dropped on busy
378 * servers this needs to be solved differently.
379 * We do take care of the PMTU discovery (RFC1191) special case:
380 * we can receive locally generated ICMP messages while the socket is held.
382 if (sock_owned_by_user(sk) &&
383 type != ICMP_DEST_UNREACH &&
384 code != ICMP_FRAG_NEEDED)
385 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
387 if (sk->sk_state == TCP_CLOSE)
390 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
391 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
397 seq = ntohl(th->seq);
398 if (sk->sk_state != TCP_LISTEN &&
399 !between(seq, tp->snd_una, tp->snd_nxt)) {
400 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
406 do_redirect(icmp_skb, sk);
408 case ICMP_SOURCE_QUENCH:
409 /* Just silently ignore these. */
411 case ICMP_PARAMETERPROB:
414 case ICMP_DEST_UNREACH:
415 if (code > NR_ICMP_UNREACH)
418 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
420 if (!sock_owned_by_user(sk)) {
421 tcp_v4_mtu_reduced(sk);
423 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
429 err = icmp_err_convert[code].errno;
430 /* check if icmp_skb allows revert of backoff
431 * (see draft-zimmermann-tcp-lcd) */
432 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
434 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
438 if (sock_owned_by_user(sk))
441 icsk->icsk_backoff--;
442 inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
443 TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
446 skb = tcp_write_queue_head(sk);
449 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
450 tcp_time_stamp - TCP_SKB_CB(skb)->when);
453 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
454 remaining, TCP_RTO_MAX);
456 /* RTO revert clocked out retransmission.
457 * Will retransmit now */
458 tcp_retransmit_timer(sk);
462 case ICMP_TIME_EXCEEDED:
469 switch (sk->sk_state) {
470 struct request_sock *req, **prev;
472 if (sock_owned_by_user(sk))
475 req = inet_csk_search_req(sk, &prev, th->dest,
476 iph->daddr, iph->saddr);
480 /* ICMPs are not backlogged, hence we cannot get
481 an established socket here.
485 if (seq != tcp_rsk(req)->snt_isn) {
486 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
491 * Still in SYN_RECV, just remove it silently.
492 * There is no good way to pass the error to the newly
493 * created socket, and POSIX does not want network
494 * errors returned from accept().
496 inet_csk_reqsk_queue_drop(sk, req, prev);
500 case TCP_SYN_RECV: /* Cannot happen.
501 It can, e.g., if SYNs crossed.
503 if (!sock_owned_by_user(sk)) {
506 sk->sk_error_report(sk);
510 sk->sk_err_soft = err;
515 /* If we've already connected we will keep trying
516 * until we time out, or the user gives up.
518 * rfc1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
519 * PORT_UNREACH as hard errors (well, FRAG_FAILED too, but it is
520 * obsoleted by pmtu discovery).
522 * Note that in the modern internet, where routing is unreliable and
523 * broken firewalls sit in each dark corner sending random errors
524 * ordered by their masters, even these two messages have finally lost
525 * their original sense (even Linux sends invalid PORT_UNREACHs).
527 * Now we are in compliance with the RFCs.
532 if (!sock_owned_by_user(sk) && inet->recverr) {
534 sk->sk_error_report(sk);
535 } else { /* Only an error on timeout */
536 sk->sk_err_soft = err;
544 static void __tcp_v4_send_check(struct sk_buff *skb,
545 __be32 saddr, __be32 daddr)
547 struct tcphdr *th = tcp_hdr(skb);
549 if (skb->ip_summed == CHECKSUM_PARTIAL) {
550 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
551 skb->csum_start = skb_transport_header(skb) - skb->head;
552 skb->csum_offset = offsetof(struct tcphdr, check);
554 th->check = tcp_v4_check(skb->len, saddr, daddr,
561 /* This routine computes an IPv4 TCP checksum. */
562 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
564 const struct inet_sock *inet = inet_sk(sk);
566 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
568 EXPORT_SYMBOL(tcp_v4_send_check);
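/*
 * Illustrative sketch (not part of this file): tcp_v4_check() computes the
 * classic RFC 1071 Internet checksum over the IPv4 pseudo-header plus the
 * segment: sum 16-bit words, fold the carries, take the one's complement.
 * A plain, unoptimized rendition of the fold:
 *
 *	static unsigned short csum_fold16(const unsigned char *p, int len,
 *					  unsigned long sum)
 *	{
 *		while (len > 1) {
 *			sum += (p[0] << 8) | p[1];
 *			p += 2;
 *			len -= 2;
 *		}
 *		if (len)
 *			sum += p[0] << 8;
 *		while (sum >> 16)
 *			sum = (sum & 0xffff) + (sum >> 16);
 *		return (unsigned short)(~sum & 0xffff);
 *	}
 *
 * The pseudo-header (saddr, daddr, a zero byte, IPPROTO_TCP, and the TCP
 * length) is accumulated into 'sum' the same way before the segment bytes
 * are folded in.
 */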
570 int tcp_v4_gso_send_check(struct sk_buff *skb)
572 const struct iphdr *iph;
575 if (!pskb_may_pull(skb, sizeof(*th)))
582 skb->ip_summed = CHECKSUM_PARTIAL;
583 __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
588 * This routine will send an RST to the other tcp.
590 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)?
592 * Answer: if a packet caused the RST, it is not for a socket
593 * existing in our system; if it is matched to a socket,
594 * it is just a duplicate segment or a bug in the other side's TCP.
595 * So we build the reply based only on the parameters that
596 * arrived with the segment.
597 * Exception: precedence violation. We do not implement it in any case.
600 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
602 const struct tcphdr *th = tcp_hdr(skb);
605 #ifdef CONFIG_TCP_MD5SIG
606 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
609 struct ip_reply_arg arg;
610 #ifdef CONFIG_TCP_MD5SIG
611 struct tcp_md5sig_key *key;
612 const __u8 *hash_location = NULL;
613 unsigned char newhash[16];
615 struct sock *sk1 = NULL;
619 /* Never send a reset in response to a reset. */
623 if (skb_rtable(skb)->rt_type != RTN_LOCAL)
626 /* Swap the send and the receive. */
627 memset(&rep, 0, sizeof(rep));
628 rep.th.dest = th->source;
629 rep.th.source = th->dest;
630 rep.th.doff = sizeof(struct tcphdr) / 4;
634 rep.th.seq = th->ack_seq;
637 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
638 skb->len - (th->doff << 2));
641 memset(&arg, 0, sizeof(arg));
642 arg.iov[0].iov_base = (unsigned char *)&rep;
643 arg.iov[0].iov_len = sizeof(rep.th);
645 #ifdef CONFIG_TCP_MD5SIG
646 hash_location = tcp_parse_md5sig_option(th);
647 if (!sk && hash_location) {
649 * The active side is lost. Try to find the listening socket through the
650 * source port, and then find the md5 key through the listening socket.
651 * We are not losing security here:
652 * the incoming packet is checked with the md5 hash of the found key;
653 * no RST is generated if the md5 hash doesn't match.
655 sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
656 &tcp_hashinfo, ip_hdr(skb)->daddr,
657 ntohs(th->source), inet_iif(skb));
658 /* don't send a rst if we can't find the key */
662 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
663 &ip_hdr(skb)->saddr, AF_INET);
667 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
668 if (genhash || memcmp(hash_location, newhash, 16) != 0)
671 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
677 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
679 (TCPOPT_MD5SIG << 8) |
681 /* Update length and the length the header thinks exists */
682 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
683 rep.th.doff = arg.iov[0].iov_len / 4;
685 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
686 key, ip_hdr(skb)->saddr,
687 ip_hdr(skb)->daddr, &rep.th);
690 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
691 ip_hdr(skb)->saddr, /* XXX */
692 arg.iov[0].iov_len, IPPROTO_TCP, 0);
693 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
694 arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
695 /* When the socket is gone, all binding information is lost;
696 * routing might fail in this case. Use iif for oif to
697 * make sure we can deliver it.
699 arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb);
701 net = dev_net(skb_dst(skb)->dev);
702 arg.tos = ip_hdr(skb)->tos;
703 ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
704 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
706 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
707 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
709 #ifdef CONFIG_TCP_MD5SIG
718 /* The code below, sending ACKs in SYN-RECV and TIME-WAIT states
719 outside socket context, is certainly ugly. What can I do?
722 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
723 u32 win, u32 ts, int oif,
724 struct tcp_md5sig_key *key,
725 int reply_flags, u8 tos)
727 const struct tcphdr *th = tcp_hdr(skb);
730 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
731 #ifdef CONFIG_TCP_MD5SIG
732 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
736 struct ip_reply_arg arg;
737 struct net *net = dev_net(skb_dst(skb)->dev);
739 memset(&rep.th, 0, sizeof(struct tcphdr));
740 memset(&arg, 0, sizeof(arg));
742 arg.iov[0].iov_base = (unsigned char *)&rep;
743 arg.iov[0].iov_len = sizeof(rep.th);
745 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
746 (TCPOPT_TIMESTAMP << 8) |
748 rep.opt[1] = htonl(tcp_time_stamp);
749 rep.opt[2] = htonl(ts);
750 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
753 /* Swap the send and the receive. */
754 rep.th.dest = th->source;
755 rep.th.source = th->dest;
756 rep.th.doff = arg.iov[0].iov_len / 4;
757 rep.th.seq = htonl(seq);
758 rep.th.ack_seq = htonl(ack);
760 rep.th.window = htons(win);
762 #ifdef CONFIG_TCP_MD5SIG
764 int offset = (ts) ? 3 : 0;
766 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
768 (TCPOPT_MD5SIG << 8) |
770 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
771 rep.th.doff = arg.iov[0].iov_len/4;
773 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
774 key, ip_hdr(skb)->saddr,
775 ip_hdr(skb)->daddr, &rep.th);
778 arg.flags = reply_flags;
779 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
780 ip_hdr(skb)->saddr, /* XXX */
781 arg.iov[0].iov_len, IPPROTO_TCP, 0);
782 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
784 arg.bound_dev_if = oif;
786 ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
787 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
789 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
792 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
794 struct inet_timewait_sock *tw = inet_twsk(sk);
795 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
797 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
798 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
801 tcp_twsk_md5_key(tcptw),
802 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
809 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
810 struct request_sock *req)
812 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
813 tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
816 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
818 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
823 * Send a SYN-ACK after having received a SYN.
824 * This still operates on a request_sock only, not on a big
825 * socket.
827 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
828 struct request_sock *req,
829 struct request_values *rvp,
833 const struct inet_request_sock *ireq = inet_rsk(req);
836 struct sk_buff * skb;
838 /* First, grab a route. */
839 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
842 skb = tcp_make_synack(sk, dst, req, rvp);
845 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
847 skb_set_queue_mapping(skb, queue_mapping);
848 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
851 err = net_xmit_eval(err);
857 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
858 struct request_values *rvp)
860 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
861 return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
865 * IPv4 request_sock destructor.
867 static void tcp_v4_reqsk_destructor(struct request_sock *req)
869 kfree(inet_rsk(req)->opt);
873 * Return true if a syncookie should be sent
875 bool tcp_syn_flood_action(struct sock *sk,
876 const struct sk_buff *skb,
879 const char *msg = "Dropping request";
880 bool want_cookie = false;
881 struct listen_sock *lopt;
885 #ifdef CONFIG_SYN_COOKIES
886 if (sysctl_tcp_syncookies) {
887 msg = "Sending cookies";
889 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
892 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
894 lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
895 if (!lopt->synflood_warned) {
896 lopt->synflood_warned = 1;
897 pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
898 proto, ntohs(tcp_hdr(skb)->dest), msg);
902 EXPORT_SYMBOL(tcp_syn_flood_action);
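/*
 * Illustrative sketch (not part of this file): the "Sending cookies" branch
 * above depends on net.ipv4.tcp_syncookies; enabling it from C mirrors the
 * tcp_tw_reuse example earlier in this file:
 *
 *	FILE *f = fopen("/proc/sys/net/ipv4/tcp_syncookies", "w");
 *
 *	if (f) {
 *		fputs("1\n", f);
 *		fclose(f);
 *	}
 */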
905 * Save and compile IPv4 options into the request_sock if needed.
907 static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
910 const struct ip_options *opt = &(IPCB(skb)->opt);
911 struct ip_options_rcu *dopt = NULL;
913 if (opt && opt->optlen) {
914 int opt_size = sizeof(*dopt) + opt->optlen;
916 dopt = kmalloc(opt_size, GFP_ATOMIC);
918 if (ip_options_echo(&dopt->opt, skb)) {
927 #ifdef CONFIG_TCP_MD5SIG
929 * RFC2385 MD5 checksumming requires a mapping of
930 * IP address->MD5 Key.
931 * We need to maintain these in the sk structure.
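/*
 * Illustrative userspace sketch (not part of this file): the per-peer keys
 * kept here are installed through the TCP_MD5SIG socket option, which lands
 * in tcp_v4_parse_md5_keys() below. A minimal sketch for one IPv4 peer,
 * assuming the userspace <netinet/tcp.h> provides struct tcp_md5sig:
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static int set_md5_key(int fd, const struct sockaddr_in *peer,
 *			       const void *key, unsigned short keylen)
 *	{
 *		struct tcp_md5sig md5;
 *
 *		memset(&md5, 0, sizeof(md5));
 *		memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
 *		md5.tcpm_keylen = keylen;
 *		memcpy(md5.tcpm_key, key, keylen);
 *		return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG,
 *				  &md5, sizeof(md5));
 *	}
 */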
934 /* Find the Key structure for an address. */
935 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
936 const union tcp_md5_addr *addr,
939 struct tcp_sock *tp = tcp_sk(sk);
940 struct tcp_md5sig_key *key;
941 struct hlist_node *pos;
942 unsigned int size = sizeof(struct in_addr);
943 struct tcp_md5sig_info *md5sig;
945 /* caller either holds rcu_read_lock() or socket lock */
946 md5sig = rcu_dereference_check(tp->md5sig_info,
947 sock_owned_by_user(sk) ||
948 lockdep_is_held(&sk->sk_lock.slock));
951 #if IS_ENABLED(CONFIG_IPV6)
952 if (family == AF_INET6)
953 size = sizeof(struct in6_addr);
955 hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
956 if (key->family != family)
958 if (!memcmp(&key->addr, addr, size))
963 EXPORT_SYMBOL(tcp_md5_do_lookup);
965 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
966 struct sock *addr_sk)
968 union tcp_md5_addr *addr;
970 addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
971 return tcp_md5_do_lookup(sk, addr, AF_INET);
973 EXPORT_SYMBOL(tcp_v4_md5_lookup);
975 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
976 struct request_sock *req)
978 union tcp_md5_addr *addr;
980 addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
981 return tcp_md5_do_lookup(sk, addr, AF_INET);
984 /* This can be called on a newly created socket, from other files */
985 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
986 int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
988 /* Add Key to the list */
989 struct tcp_md5sig_key *key;
990 struct tcp_sock *tp = tcp_sk(sk);
991 struct tcp_md5sig_info *md5sig;
993 key = tcp_md5_do_lookup(sk, addr, family);
995 /* Pre-existing entry - just update that one. */
996 memcpy(key->key, newkey, newkeylen);
997 key->keylen = newkeylen;
1001 md5sig = rcu_dereference_protected(tp->md5sig_info,
1002 sock_owned_by_user(sk));
1004 md5sig = kmalloc(sizeof(*md5sig), gfp);
1008 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1009 INIT_HLIST_HEAD(&md5sig->head);
1010 rcu_assign_pointer(tp->md5sig_info, md5sig);
1013 key = sock_kmalloc(sk, sizeof(*key), gfp);
1016 if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
1017 sock_kfree_s(sk, key, sizeof(*key));
1021 memcpy(key->key, newkey, newkeylen);
1022 key->keylen = newkeylen;
1023 key->family = family;
1024 memcpy(&key->addr, addr,
1025 (family == AF_INET6) ? sizeof(struct in6_addr) :
1026 sizeof(struct in_addr));
1027 hlist_add_head_rcu(&key->node, &md5sig->head);
1030 EXPORT_SYMBOL(tcp_md5_do_add);
1032 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1034 struct tcp_sock *tp = tcp_sk(sk);
1035 struct tcp_md5sig_key *key;
1036 struct tcp_md5sig_info *md5sig;
1038 key = tcp_md5_do_lookup(sk, addr, family);
1041 hlist_del_rcu(&key->node);
1042 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1043 kfree_rcu(key, rcu);
1044 md5sig = rcu_dereference_protected(tp->md5sig_info,
1045 sock_owned_by_user(sk));
1046 if (hlist_empty(&md5sig->head))
1047 tcp_free_md5sig_pool();
1050 EXPORT_SYMBOL(tcp_md5_do_del);
1052 void tcp_clear_md5_list(struct sock *sk)
1054 struct tcp_sock *tp = tcp_sk(sk);
1055 struct tcp_md5sig_key *key;
1056 struct hlist_node *pos, *n;
1057 struct tcp_md5sig_info *md5sig;
1059 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1061 if (!hlist_empty(&md5sig->head))
1062 tcp_free_md5sig_pool();
1063 hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
1064 hlist_del_rcu(&key->node);
1065 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1066 kfree_rcu(key, rcu);
1070 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1073 struct tcp_md5sig cmd;
1074 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1076 if (optlen < sizeof(cmd))
1079 if (copy_from_user(&cmd, optval, sizeof(cmd)))
1082 if (sin->sin_family != AF_INET)
1085 if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1086 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1089 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1092 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1093 AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1097 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1098 __be32 daddr, __be32 saddr, int nbytes)
1100 struct tcp4_pseudohdr *bp;
1101 struct scatterlist sg;
1103 bp = &hp->md5_blk.ip4;
1106 * 1. the TCP pseudo-header (in the order: source IP address,
1107 * destination IP address, zero-padded protocol number, and
1108 * segment length)
1113 bp->protocol = IPPROTO_TCP;
1114 bp->len = cpu_to_be16(nbytes);
1116 sg_init_one(&sg, bp, sizeof(*bp));
1117 return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1120 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1121 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1123 struct tcp_md5sig_pool *hp;
1124 struct hash_desc *desc;
1126 hp = tcp_get_md5sig_pool();
1128 goto clear_hash_noput;
1129 desc = &hp->md5_desc;
1131 if (crypto_hash_init(desc))
1133 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1135 if (tcp_md5_hash_header(hp, th))
1137 if (tcp_md5_hash_key(hp, key))
1139 if (crypto_hash_final(desc, md5_hash))
1142 tcp_put_md5sig_pool();
1146 tcp_put_md5sig_pool();
1148 memset(md5_hash, 0, 16);
1152 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1153 const struct sock *sk, const struct request_sock *req,
1154 const struct sk_buff *skb)
1156 struct tcp_md5sig_pool *hp;
1157 struct hash_desc *desc;
1158 const struct tcphdr *th = tcp_hdr(skb);
1159 __be32 saddr, daddr;
1162 saddr = inet_sk(sk)->inet_saddr;
1163 daddr = inet_sk(sk)->inet_daddr;
1165 saddr = inet_rsk(req)->loc_addr;
1166 daddr = inet_rsk(req)->rmt_addr;
1168 const struct iphdr *iph = ip_hdr(skb);
1173 hp = tcp_get_md5sig_pool();
1175 goto clear_hash_noput;
1176 desc = &hp->md5_desc;
1178 if (crypto_hash_init(desc))
1181 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1183 if (tcp_md5_hash_header(hp, th))
1185 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1187 if (tcp_md5_hash_key(hp, key))
1189 if (crypto_hash_final(desc, md5_hash))
1192 tcp_put_md5sig_pool();
1196 tcp_put_md5sig_pool();
1198 memset(md5_hash, 0, 16);
1201 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1203 static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1206 * This gets called for each TCP segment that arrives
1207 * so we want to be efficient.
1208 * We have 3 drop cases:
1209 * o No MD5 hash and one expected.
1210 * o MD5 hash and we're not expecting one.
1211 * o MD5 hash and it's wrong.
1213 const __u8 *hash_location = NULL;
1214 struct tcp_md5sig_key *hash_expected;
1215 const struct iphdr *iph = ip_hdr(skb);
1216 const struct tcphdr *th = tcp_hdr(skb);
1218 unsigned char newhash[16];
1220 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1222 hash_location = tcp_parse_md5sig_option(th);
1224 /* We've parsed the options - do we have a hash? */
1225 if (!hash_expected && !hash_location)
1228 if (hash_expected && !hash_location) {
1229 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1233 if (!hash_expected && hash_location) {
1234 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1238 /* Okay, so we have both hash_expected and hash_location -
1239 * so we need to calculate the checksum.
1241 genhash = tcp_v4_md5_hash_skb(newhash,
1245 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1246 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1247 &iph->saddr, ntohs(th->source),
1248 &iph->daddr, ntohs(th->dest),
1249 genhash ? " tcp_v4_calc_md5_hash failed"
1258 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1260 .obj_size = sizeof(struct tcp_request_sock),
1261 .rtx_syn_ack = tcp_v4_rtx_synack,
1262 .send_ack = tcp_v4_reqsk_send_ack,
1263 .destructor = tcp_v4_reqsk_destructor,
1264 .send_reset = tcp_v4_send_reset,
1265 .syn_ack_timeout = tcp_syn_ack_timeout,
1268 #ifdef CONFIG_TCP_MD5SIG
1269 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1270 .md5_lookup = tcp_v4_reqsk_md5_lookup,
1271 .calc_md5_hash = tcp_v4_md5_hash_skb,
1275 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1277 struct tcp_extend_values tmp_ext;
1278 struct tcp_options_received tmp_opt;
1279 const u8 *hash_location;
1280 struct request_sock *req;
1281 struct inet_request_sock *ireq;
1282 struct tcp_sock *tp = tcp_sk(sk);
1283 struct dst_entry *dst = NULL;
1284 __be32 saddr = ip_hdr(skb)->saddr;
1285 __be32 daddr = ip_hdr(skb)->daddr;
1286 __u32 isn = TCP_SKB_CB(skb)->when;
1287 bool want_cookie = false;
1289 /* Never answer to SYNs sent to broadcast or multicast */
1290 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1293 /* TW buckets are converted to open requests without
1294 * limitations; they conserve resources and the peer is
1295 * evidently a real one.
1297 if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1298 want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1303 /* Accept backlog is full. If we have already queued enough
1304 * of warm entries in the syn queue, drop the request. It is better than
1305 * clogging the syn queue with openreqs with exponentially increasing
1306 * timeout.
1308 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1311 req = inet_reqsk_alloc(&tcp_request_sock_ops);
1315 #ifdef CONFIG_TCP_MD5SIG
1316 tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1319 tcp_clear_options(&tmp_opt);
1320 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1321 tmp_opt.user_mss = tp->rx_opt.user_mss;
1322 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
1324 if (tmp_opt.cookie_plus > 0 &&
1325 tmp_opt.saw_tstamp &&
1326 !tp->rx_opt.cookie_out_never &&
1327 (sysctl_tcp_cookie_size > 0 ||
1328 (tp->cookie_values != NULL &&
1329 tp->cookie_values->cookie_desired > 0))) {
1331 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1332 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1334 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1335 goto drop_and_release;
1337 /* Secret recipe starts with IP addresses */
1338 *mess++ ^= (__force u32)daddr;
1339 *mess++ ^= (__force u32)saddr;
1341 /* plus variable length Initiator Cookie */
1344 *c++ ^= *hash_location++;
1346 want_cookie = false; /* not our kind of cookie */
1347 tmp_ext.cookie_out_never = 0; /* false */
1348 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1349 } else if (!tp->rx_opt.cookie_in_always) {
1350 /* redundant indications, but ensure initialization. */
1351 tmp_ext.cookie_out_never = 1; /* true */
1352 tmp_ext.cookie_plus = 0;
1354 goto drop_and_release;
1356 tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1358 if (want_cookie && !tmp_opt.saw_tstamp)
1359 tcp_clear_options(&tmp_opt);
1361 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1362 tcp_openreq_init(req, &tmp_opt, skb);
1364 ireq = inet_rsk(req);
1365 ireq->loc_addr = daddr;
1366 ireq->rmt_addr = saddr;
1367 ireq->no_srccheck = inet_sk(sk)->transparent;
1368 ireq->opt = tcp_v4_save_options(sk, skb);
1370 if (security_inet_conn_request(sk, skb, req))
1373 if (!want_cookie || tmp_opt.tstamp_ok)
1374 TCP_ECN_create_request(req, skb);
1377 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1378 req->cookie_ts = tmp_opt.tstamp_ok;
1382 /* VJ's idea. We save the last timestamp seen
1383 * from the destination in the peer table, when entering
1384 * TIME-WAIT state, and check against it before
1385 * accepting a new connection request.
1387 * If "isn" is not zero, this request hit an alive
1388 * timewait bucket, so all the necessary checks
1389 * are made in the function processing the timewait state.
1391 if (tmp_opt.saw_tstamp &&
1392 tcp_death_row.sysctl_tw_recycle &&
1393 (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1394 fl4.daddr == saddr) {
1395 if (!tcp_peer_is_proven(req, dst, true)) {
1396 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1397 goto drop_and_release;
1400 /* Kill the following clause, if you dislike this way. */
1401 else if (!sysctl_tcp_syncookies &&
1402 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1403 (sysctl_max_syn_backlog >> 2)) &&
1404 !tcp_peer_is_proven(req, dst, false)) {
1405 /* Without syncookies the last quarter of the
1406 * backlog is filled with destinations
1407 * proven to be alive.
1408 * It means that we continue to communicate
1409 * only with destinations already remembered
1410 * at the moment of the synflood.
1412 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1413 &saddr, ntohs(tcp_hdr(skb)->source));
1414 goto drop_and_release;
1417 isn = tcp_v4_init_sequence(skb);
1419 tcp_rsk(req)->snt_isn = isn;
1420 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1422 if (tcp_v4_send_synack(sk, dst, req,
1423 (struct request_values *)&tmp_ext,
1424 skb_get_queue_mapping(skb),
1429 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1439 EXPORT_SYMBOL(tcp_v4_conn_request);
1443 * The three-way handshake has completed - we got a valid ACK -
1444 * now create the new socket.
1446 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1447 struct request_sock *req,
1448 struct dst_entry *dst)
1450 struct inet_request_sock *ireq;
1451 struct inet_sock *newinet;
1452 struct tcp_sock *newtp;
1454 #ifdef CONFIG_TCP_MD5SIG
1455 struct tcp_md5sig_key *key;
1457 struct ip_options_rcu *inet_opt;
1459 if (sk_acceptq_is_full(sk))
1462 newsk = tcp_create_openreq_child(sk, req, skb);
1466 newsk->sk_gso_type = SKB_GSO_TCPV4;
1467 inet_sk_rx_dst_set(newsk, skb);
1469 newtp = tcp_sk(newsk);
1470 newinet = inet_sk(newsk);
1471 ireq = inet_rsk(req);
1472 newinet->inet_daddr = ireq->rmt_addr;
1473 newinet->inet_rcv_saddr = ireq->loc_addr;
1474 newinet->inet_saddr = ireq->loc_addr;
1475 inet_opt = ireq->opt;
1476 rcu_assign_pointer(newinet->inet_opt, inet_opt);
1478 newinet->mc_index = inet_iif(skb);
1479 newinet->mc_ttl = ip_hdr(skb)->ttl;
1480 newinet->rcv_tos = ip_hdr(skb)->tos;
1481 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1483 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1484 newinet->inet_id = newtp->write_seq ^ jiffies;
1487 dst = inet_csk_route_child_sock(sk, newsk, req);
1491 /* syncookie case : see end of cookie_v4_check() */
1493 sk_setup_caps(newsk, dst);
1495 tcp_mtup_init(newsk);
1496 tcp_sync_mss(newsk, dst_mtu(dst));
1497 newtp->advmss = dst_metric_advmss(dst);
1498 if (tcp_sk(sk)->rx_opt.user_mss &&
1499 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1500 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1502 tcp_initialize_rcv_mss(newsk);
1503 if (tcp_rsk(req)->snt_synack)
1504 tcp_valid_rtt_meas(newsk,
1505 tcp_time_stamp - tcp_rsk(req)->snt_synack);
1506 newtp->total_retrans = req->retrans;
1508 #ifdef CONFIG_TCP_MD5SIG
1509 /* Copy over the MD5 key from the original socket */
1510 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1514 * We're using one, so create a matching key
1515 * on the newsk structure. If we fail to get
1516 * memory, then we end up not copying the key
1517 * across. Shucks.
1519 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1520 AF_INET, key->key, key->keylen, GFP_ATOMIC);
1521 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1525 if (__inet_inherit_port(sk, newsk) < 0)
1527 __inet_hash_nolisten(newsk, NULL);
1532 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1536 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1539 tcp_clear_xmit_timers(newsk);
1540 tcp_cleanup_congestion_control(newsk);
1541 bh_unlock_sock(newsk);
1545 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
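/*
 * Illustrative userspace sketch (not part of this file): the handshake that
 * ends in tcp_v4_syn_recv_sock() starts with an ordinary passive open; the
 * child socket created above is what accept() eventually returns (error
 * handling omitted for brevity):
 *
 *	struct sockaddr_in sin;
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *	memset(&sin, 0, sizeof(sin));
 *	sin.sin_family = AF_INET;
 *	sin.sin_port = htons(8080);
 *	sin.sin_addr.s_addr = htonl(INADDR_ANY);
 *	bind(fd, (struct sockaddr *)&sin, sizeof(sin));
 *	listen(fd, 128);
 *	for (;;) {
 *		int conn = accept(fd, NULL, NULL);
 *
 *		if (conn >= 0)
 *			close(conn);
 *	}
 */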
1547 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1549 struct tcphdr *th = tcp_hdr(skb);
1550 const struct iphdr *iph = ip_hdr(skb);
1552 struct request_sock **prev;
1553 /* Find possible connection requests. */
1554 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1555 iph->saddr, iph->daddr);
1557 return tcp_check_req(sk, skb, req, prev);
1559 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1560 th->source, iph->daddr, th->dest, inet_iif(skb));
1563 if (nsk->sk_state != TCP_TIME_WAIT) {
1567 inet_twsk_put(inet_twsk(nsk));
1571 #ifdef CONFIG_SYN_COOKIES
1573 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1578 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1580 const struct iphdr *iph = ip_hdr(skb);
1582 if (skb->ip_summed == CHECKSUM_COMPLETE) {
1583 if (!tcp_v4_check(skb->len, iph->saddr,
1584 iph->daddr, skb->csum)) {
1585 skb->ip_summed = CHECKSUM_UNNECESSARY;
1590 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1591 skb->len, IPPROTO_TCP, 0);
1593 if (skb->len <= 76) {
1594 return __skb_checksum_complete(skb);
1600 /* The socket must have its spinlock held when we get
1601 * here.
1603 * We have a potential double-lock case here, so even when
1604 * doing backlog processing we use the BH locking scheme.
1605 * This is because we cannot sleep with the original spinlock
1606 * held.
1608 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1611 #ifdef CONFIG_TCP_MD5SIG
1613 * We really want to reject the packet as early as possible
1614 * when:
1615 * o We're expecting an MD5'd packet and there is no MD5 tcp option
1616 * o There is an MD5 option and we're not expecting one
1618 if (tcp_v4_inbound_md5_hash(sk, skb))
1622 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1623 struct dst_entry *dst = sk->sk_rx_dst;
1625 sock_rps_save_rxhash(sk, skb);
1627 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1628 dst->ops->check(dst, 0) == NULL) {
1630 sk->sk_rx_dst = NULL;
1633 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1640 if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1643 if (sk->sk_state == TCP_LISTEN) {
1644 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1649 sock_rps_save_rxhash(nsk, skb);
1650 if (tcp_child_process(sk, nsk, skb)) {
1657 sock_rps_save_rxhash(sk, skb);
1659 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1666 tcp_v4_send_reset(rsk, skb);
1669 /* Be careful here. If this function gets more complicated and
1670 * gcc suffers from register pressure on the x86, sk (in %ebx)
1671 * might be destroyed here. This current version compiles correctly,
1672 * but you have been warned.
1677 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1680 EXPORT_SYMBOL(tcp_v4_do_rcv);
1682 void tcp_v4_early_demux(struct sk_buff *skb)
1684 struct net *net = dev_net(skb->dev);
1685 const struct iphdr *iph;
1686 const struct tcphdr *th;
1689 if (skb->pkt_type != PACKET_HOST)
1692 if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
1696 th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
1698 if (th->doff < sizeof(struct tcphdr) / 4)
1701 sk = __inet_lookup_established(net, &tcp_hashinfo,
1702 iph->saddr, th->source,
1703 iph->daddr, ntohs(th->dest),
1707 skb->destructor = sock_edemux;
1708 if (sk->sk_state != TCP_TIME_WAIT) {
1709 struct dst_entry *dst = sk->sk_rx_dst;
1712 dst = dst_check(dst, 0);
1714 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1715 skb_dst_set_noref(skb, dst);
1724 int tcp_v4_rcv(struct sk_buff *skb)
1726 const struct iphdr *iph;
1727 const struct tcphdr *th;
1730 struct net *net = dev_net(skb->dev);
1732 if (skb->pkt_type != PACKET_HOST)
1735 /* Count it even if it's bad */
1736 TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1738 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1743 if (th->doff < sizeof(struct tcphdr) / 4)
1745 if (!pskb_may_pull(skb, th->doff * 4))
1748 /* An explanation is required here, I think.
1749 * Packet length and doff are validated by header prediction,
1750 * provided the case of th->doff==0 is eliminated.
1751 * So, we defer the checks. */
1752 if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1757 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1758 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1759 skb->len - th->doff * 4);
1760 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1761 TCP_SKB_CB(skb)->when = 0;
1762 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1763 TCP_SKB_CB(skb)->sacked = 0;
1765 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1770 if (sk->sk_state == TCP_TIME_WAIT)
1773 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1774 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1775 goto discard_and_relse;
1778 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1779 goto discard_and_relse;
1782 if (sk_filter(sk, skb))
1783 goto discard_and_relse;
1787 bh_lock_sock_nested(sk);
1789 if (!sock_owned_by_user(sk)) {
1790 #ifdef CONFIG_NET_DMA
1791 struct tcp_sock *tp = tcp_sk(sk);
1792 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1793 tp->ucopy.dma_chan = net_dma_find_channel();
1794 if (tp->ucopy.dma_chan)
1795 ret = tcp_v4_do_rcv(sk, skb);
1799 if (!tcp_prequeue(sk, skb))
1800 ret = tcp_v4_do_rcv(sk, skb);
1802 } else if (unlikely(sk_add_backlog(sk, skb,
1803 sk->sk_rcvbuf + sk->sk_sndbuf))) {
1805 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1806 goto discard_and_relse;
1815 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1818 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1820 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1822 tcp_v4_send_reset(NULL, skb);
1826 /* Discard frame. */
1835 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1836 inet_twsk_put(inet_twsk(sk));
1840 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1841 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1842 inet_twsk_put(inet_twsk(sk));
1845 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1847 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1849 iph->daddr, th->dest,
1852 inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1853 inet_twsk_put(inet_twsk(sk));
1857 /* Fall through to ACK */
1860 tcp_v4_timewait_ack(sk, skb);
1864 case TCP_TW_SUCCESS:;
1869 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1870 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1871 .twsk_unique = tcp_twsk_unique,
1872 .twsk_destructor= tcp_twsk_destructor,
1875 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1877 struct dst_entry *dst = skb_dst(skb);
1880 sk->sk_rx_dst = dst;
1881 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1883 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1885 const struct inet_connection_sock_af_ops ipv4_specific = {
1886 .queue_xmit = ip_queue_xmit,
1887 .send_check = tcp_v4_send_check,
1888 .rebuild_header = inet_sk_rebuild_header,
1889 .sk_rx_dst_set = inet_sk_rx_dst_set,
1890 .conn_request = tcp_v4_conn_request,
1891 .syn_recv_sock = tcp_v4_syn_recv_sock,
1892 .net_header_len = sizeof(struct iphdr),
1893 .setsockopt = ip_setsockopt,
1894 .getsockopt = ip_getsockopt,
1895 .addr2sockaddr = inet_csk_addr2sockaddr,
1896 .sockaddr_len = sizeof(struct sockaddr_in),
1897 .bind_conflict = inet_csk_bind_conflict,
1898 #ifdef CONFIG_COMPAT
1899 .compat_setsockopt = compat_ip_setsockopt,
1900 .compat_getsockopt = compat_ip_getsockopt,
1903 EXPORT_SYMBOL(ipv4_specific);
1905 #ifdef CONFIG_TCP_MD5SIG
1906 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1907 .md5_lookup = tcp_v4_md5_lookup,
1908 .calc_md5_hash = tcp_v4_md5_hash_skb,
1909 .md5_parse = tcp_v4_parse_md5_keys,
1913 /* NOTE: A lot of things are set to zero explicitly by the call to
1914 * sk_alloc(), so they need not be done here.
1916 static int tcp_v4_init_sock(struct sock *sk)
1918 struct inet_connection_sock *icsk = inet_csk(sk);
1922 icsk->icsk_af_ops = &ipv4_specific;
1924 #ifdef CONFIG_TCP_MD5SIG
1925 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1931 void tcp_v4_destroy_sock(struct sock *sk)
1933 struct tcp_sock *tp = tcp_sk(sk);
1935 tcp_clear_xmit_timers(sk);
1937 tcp_cleanup_congestion_control(sk);
1939 /* Clean up the write buffer. */
1940 tcp_write_queue_purge(sk);
1942 /* Cleans up our, hopefully empty, out_of_order_queue. */
1943 __skb_queue_purge(&tp->out_of_order_queue);
1945 #ifdef CONFIG_TCP_MD5SIG
1946 /* Clean up the MD5 key list, if any */
1947 if (tp->md5sig_info) {
1948 tcp_clear_md5_list(sk);
1949 kfree_rcu(tp->md5sig_info, rcu);
1950 tp->md5sig_info = NULL;
1954 #ifdef CONFIG_NET_DMA
1955 /* Cleans up our sk_async_wait_queue */
1956 __skb_queue_purge(&sk->sk_async_wait_queue);
1959 /* Clean the prequeue; it really must be empty */
1960 __skb_queue_purge(&tp->ucopy.prequeue);
1962 /* Clean up a referenced TCP bind bucket. */
1963 if (inet_csk(sk)->icsk_bind_hash)
1967 * If sendmsg cached page exists, toss it.
1969 if (sk->sk_sndmsg_page) {
1970 __free_page(sk->sk_sndmsg_page);
1971 sk->sk_sndmsg_page = NULL;
1974 /* TCP Cookie Transactions */
1975 if (tp->cookie_values != NULL) {
1976 kref_put(&tp->cookie_values->kref,
1977 tcp_cookie_values_release);
1978 tp->cookie_values = NULL;
1981 /* If socket is aborted during connect operation */
1982 tcp_free_fastopen_req(tp);
1984 sk_sockets_allocated_dec(sk);
1985 sock_release_memcg(sk);
1987 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1989 #ifdef CONFIG_PROC_FS
1990 /* Proc filesystem TCP sock list dumping. */
1992 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1994 return hlist_nulls_empty(head) ? NULL :
1995 list_entry(head->first, struct inet_timewait_sock, tw_node);
1998 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
2000 return !is_a_nulls(tw->tw_node.next) ?
2001 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2005 * Get the next listener socket following cur. If cur is NULL, get the
2006 * first socket starting from the bucket given in st->bucket; when
2007 * st->bucket is zero the very first socket in the hash table is returned.
2009 static void *listening_get_next(struct seq_file *seq, void *cur)
2011 struct inet_connection_sock *icsk;
2012 struct hlist_nulls_node *node;
2013 struct sock *sk = cur;
2014 struct inet_listen_hashbucket *ilb;
2015 struct tcp_iter_state *st = seq->private;
2016 struct net *net = seq_file_net(seq);
2019 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2020 spin_lock_bh(&ilb->lock);
2021 sk = sk_nulls_head(&ilb->head);
2025 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2029 if (st->state == TCP_SEQ_STATE_OPENREQ) {
2030 struct request_sock *req = cur;
2032 icsk = inet_csk(st->syn_wait_sk);
2036 if (req->rsk_ops->family == st->family) {
2042 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2045 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2047 sk = sk_nulls_next(st->syn_wait_sk);
2048 st->state = TCP_SEQ_STATE_LISTENING;
2049 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2051 icsk = inet_csk(sk);
2052 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2053 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2055 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2056 sk = sk_nulls_next(sk);
2059 sk_nulls_for_each_from(sk, node) {
2060 if (!net_eq(sock_net(sk), net))
2062 if (sk->sk_family == st->family) {
2066 icsk = inet_csk(sk);
2067 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2068 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2070 st->uid = sock_i_uid(sk);
2071 st->syn_wait_sk = sk;
2072 st->state = TCP_SEQ_STATE_OPENREQ;
2076 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2078 spin_unlock_bh(&ilb->lock);
2080 if (++st->bucket < INET_LHTABLE_SIZE) {
2081 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2082 spin_lock_bh(&ilb->lock);
2083 sk = sk_nulls_head(&ilb->head);
2091 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2093 struct tcp_iter_state *st = seq->private;
2098 rc = listening_get_next(seq, NULL);
2100 while (rc && *pos) {
2101 rc = listening_get_next(seq, rc);
2107 static inline bool empty_bucket(struct tcp_iter_state *st)
2109 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2110 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2114 * Get first established socket starting from bucket given in st->bucket.
2115 * If st->bucket is zero, the very first socket in the hash is returned.
2117 static void *established_get_first(struct seq_file *seq)
2119 struct tcp_iter_state *st = seq->private;
2120 struct net *net = seq_file_net(seq);
2124 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2126 struct hlist_nulls_node *node;
2127 struct inet_timewait_sock *tw;
2128 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2130 /* Lockless fast path for the common case of empty buckets */
2131 if (empty_bucket(st))
2135 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2136 if (sk->sk_family != st->family ||
2137 !net_eq(sock_net(sk), net)) {
2143 st->state = TCP_SEQ_STATE_TIME_WAIT;
2144 inet_twsk_for_each(tw, node,
2145 &tcp_hashinfo.ehash[st->bucket].twchain) {
2146 if (tw->tw_family != st->family ||
2147 !net_eq(twsk_net(tw), net)) {
2153 spin_unlock_bh(lock);
2154 st->state = TCP_SEQ_STATE_ESTABLISHED;
2160 static void *established_get_next(struct seq_file *seq, void *cur)
2162 struct sock *sk = cur;
2163 struct inet_timewait_sock *tw;
2164 struct hlist_nulls_node *node;
2165 struct tcp_iter_state *st = seq->private;
2166 struct net *net = seq_file_net(seq);
2171 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2175 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2182 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2183 st->state = TCP_SEQ_STATE_ESTABLISHED;
2185 /* Look for the next non-empty bucket */
2187 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2190 if (st->bucket > tcp_hashinfo.ehash_mask)
2193 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2194 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2196 sk = sk_nulls_next(sk);
2198 sk_nulls_for_each_from(sk, node) {
2199 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2203 st->state = TCP_SEQ_STATE_TIME_WAIT;
2204 tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2212 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2214 struct tcp_iter_state *st = seq->private;
2218 rc = established_get_first(seq);
2221 rc = established_get_next(seq, rc);
2227 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2230 struct tcp_iter_state *st = seq->private;
2232 st->state = TCP_SEQ_STATE_LISTENING;
2233 rc = listening_get_idx(seq, &pos);
2236 st->state = TCP_SEQ_STATE_ESTABLISHED;
2237 rc = established_get_idx(seq, pos);
2243 static void *tcp_seek_last_pos(struct seq_file *seq)
2245 struct tcp_iter_state *st = seq->private;
2246 int offset = st->offset;
2247 int orig_num = st->num;
2250 switch (st->state) {
2251 case TCP_SEQ_STATE_OPENREQ:
2252 case TCP_SEQ_STATE_LISTENING:
2253 if (st->bucket >= INET_LHTABLE_SIZE)
2255 st->state = TCP_SEQ_STATE_LISTENING;
2256 rc = listening_get_next(seq, NULL);
2257 while (offset-- && rc)
2258 rc = listening_get_next(seq, rc);
2263 case TCP_SEQ_STATE_ESTABLISHED:
2264 case TCP_SEQ_STATE_TIME_WAIT:
2265 st->state = TCP_SEQ_STATE_ESTABLISHED;
2266 if (st->bucket > tcp_hashinfo.ehash_mask)
2268 rc = established_get_first(seq);
2269 while (offset-- && rc)
2270 rc = established_get_next(seq, rc);
2278 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2280 struct tcp_iter_state *st = seq->private;
2283 if (*pos && *pos == st->last_pos) {
2284 rc = tcp_seek_last_pos(seq);
2289 st->state = TCP_SEQ_STATE_LISTENING;
2293 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2296 st->last_pos = *pos;
2300 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2302 struct tcp_iter_state *st = seq->private;
2305 if (v == SEQ_START_TOKEN) {
2306 rc = tcp_get_idx(seq, 0);
2310 switch (st->state) {
2311 case TCP_SEQ_STATE_OPENREQ:
2312 case TCP_SEQ_STATE_LISTENING:
2313 rc = listening_get_next(seq, v);
2315 st->state = TCP_SEQ_STATE_ESTABLISHED;
2318 rc = established_get_first(seq);
2321 case TCP_SEQ_STATE_ESTABLISHED:
2322 case TCP_SEQ_STATE_TIME_WAIT:
2323 rc = established_get_next(seq, v);
2328 st->last_pos = *pos;
2332 static void tcp_seq_stop(struct seq_file *seq, void *v)
2334 struct tcp_iter_state *st = seq->private;
2336 switch (st->state) {
2337 case TCP_SEQ_STATE_OPENREQ:
2339 struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2340 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2342 case TCP_SEQ_STATE_LISTENING:
2343 if (v != SEQ_START_TOKEN)
2344 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2346 case TCP_SEQ_STATE_TIME_WAIT:
2347 case TCP_SEQ_STATE_ESTABLISHED:
2349 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2354 int tcp_seq_open(struct inode *inode, struct file *file)
2356 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2357 struct tcp_iter_state *s;
2360 err = seq_open_net(inode, file, &afinfo->seq_ops,
2361 sizeof(struct tcp_iter_state));
2365 s = ((struct seq_file *)file->private_data)->private;
2366 s->family = afinfo->family;
2370 EXPORT_SYMBOL(tcp_seq_open);
2372 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2375 struct proc_dir_entry *p;
2377 afinfo->seq_ops.start = tcp_seq_start;
2378 afinfo->seq_ops.next = tcp_seq_next;
2379 afinfo->seq_ops.stop = tcp_seq_stop;
2381 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2382 afinfo->seq_fops, afinfo);
2387 EXPORT_SYMBOL(tcp_proc_register);
2389 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2391 proc_net_remove(net, afinfo->name);
2393 EXPORT_SYMBOL(tcp_proc_unregister);
2395 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2396 struct seq_file *f, int i, int uid, int *len)
2398 const struct inet_request_sock *ireq = inet_rsk(req);
2399 int ttd = req->expires - jiffies;
2401 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2402 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2405 ntohs(inet_sk(sk)->inet_sport),
2407 ntohs(ireq->rmt_port),
2409 0, 0, /* could print option size, but that is af dependent. */
2410 1, /* timers active (only the expire timer) */
2411 jiffies_to_clock_t(ttd),
2414 0, /* non standard timer */
2415 0, /* open_requests have no inode */
2416 atomic_read(&sk->sk_refcnt),
2421 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2424 unsigned long timer_expires;
2425 const struct tcp_sock *tp = tcp_sk(sk);
2426 const struct inet_connection_sock *icsk = inet_csk(sk);
2427 const struct inet_sock *inet = inet_sk(sk);
2428 __be32 dest = inet->inet_daddr;
2429 __be32 src = inet->inet_rcv_saddr;
2430 __u16 destp = ntohs(inet->inet_dport);
2431 __u16 srcp = ntohs(inet->inet_sport);
2434 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2436 timer_expires = icsk->icsk_timeout;
2437 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2439 timer_expires = icsk->icsk_timeout;
2440 } else if (timer_pending(&sk->sk_timer)) {
2442 timer_expires = sk->sk_timer.expires;
2445 timer_expires = jiffies;
2448 if (sk->sk_state == TCP_LISTEN)
2449 rx_queue = sk->sk_ack_backlog;
2452 * because we don't lock the socket, we might find a transient negative value
2454 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2456 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2457 "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2458 i, src, srcp, dest, destp, sk->sk_state,
2459 tp->write_seq - tp->snd_una,
2462 jiffies_to_clock_t(timer_expires - jiffies),
2463 icsk->icsk_retransmits,
2465 icsk->icsk_probes_out,
2467 atomic_read(&sk->sk_refcnt), sk,
2468 jiffies_to_clock_t(icsk->icsk_rto),
2469 jiffies_to_clock_t(icsk->icsk_ack.ato),
2470 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2472 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2476 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2477 struct seq_file *f, int i, int *len)
2481 int ttd = tw->tw_ttd - jiffies;
2486 dest = tw->tw_daddr;
2487 src = tw->tw_rcv_saddr;
2488 destp = ntohs(tw->tw_dport);
2489 srcp = ntohs(tw->tw_sport);
2491 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2492 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2493 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2494 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2495 atomic_read(&tw->tw_refcnt), tw, len);
2500 static int tcp4_seq_show(struct seq_file *seq, void *v)
2502 struct tcp_iter_state *st;
2505 if (v == SEQ_START_TOKEN) {
2506 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2507 " sl local_address rem_address st tx_queue "
2508 "rx_queue tr tm->when retrnsmt uid timeout "
2514 switch (st->state) {
2515 case TCP_SEQ_STATE_LISTENING:
2516 case TCP_SEQ_STATE_ESTABLISHED:
2517 get_tcp4_sock(v, seq, st->num, &len);
2519 case TCP_SEQ_STATE_OPENREQ:
2520 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2522 case TCP_SEQ_STATE_TIME_WAIT:
2523 get_timewait4_sock(v, seq, st->num, &len);
2526 seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
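/*
 * Illustrative userspace sketch (not part of this file): the table emitted
 * above is what tools such as netstat parse; dumping it verbatim is a short
 * loop over /proc/net/tcp:
 *
 *	#include <stdio.h>
 *
 *	static int dump_tcp4(void)
 *	{
 *		char line[256];
 *		FILE *f = fopen("/proc/net/tcp", "r");
 *
 *		if (!f)
 *			return -1;
 *		while (fgets(line, sizeof(line), f))
 *			fputs(line, stdout);
 *		return fclose(f);
 *	}
 */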
2531 static const struct file_operations tcp_afinfo_seq_fops = {
2532 .owner = THIS_MODULE,
2533 .open = tcp_seq_open,
2535 .llseek = seq_lseek,
2536 .release = seq_release_net
2539 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2542 .seq_fops = &tcp_afinfo_seq_fops,
2544 .show = tcp4_seq_show,
2548 static int __net_init tcp4_proc_init_net(struct net *net)
2550 return tcp_proc_register(net, &tcp4_seq_afinfo);
2553 static void __net_exit tcp4_proc_exit_net(struct net *net)
2555 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2558 static struct pernet_operations tcp4_net_ops = {
2559 .init = tcp4_proc_init_net,
2560 .exit = tcp4_proc_exit_net,
2563 int __init tcp4_proc_init(void)
2565 return register_pernet_subsys(&tcp4_net_ops);
2568 void tcp4_proc_exit(void)
2570 unregister_pernet_subsys(&tcp4_net_ops);
2572 #endif /* CONFIG_PROC_FS */
2574 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2576 const struct iphdr *iph = skb_gro_network_header(skb);
2578 switch (skb->ip_summed) {
2579 case CHECKSUM_COMPLETE:
2580 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2582 skb->ip_summed = CHECKSUM_UNNECESSARY;
2588 NAPI_GRO_CB(skb)->flush = 1;
2592 return tcp_gro_receive(head, skb);
2595 int tcp4_gro_complete(struct sk_buff *skb)
2597 const struct iphdr *iph = ip_hdr(skb);
2598 struct tcphdr *th = tcp_hdr(skb);
2600 th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2601 iph->saddr, iph->daddr, 0);
2602 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2604 return tcp_gro_complete(skb);
2607 struct proto tcp_prot = {
2609 .owner = THIS_MODULE,
2611 .connect = tcp_v4_connect,
2612 .disconnect = tcp_disconnect,
2613 .accept = inet_csk_accept,
2615 .init = tcp_v4_init_sock,
2616 .destroy = tcp_v4_destroy_sock,
2617 .shutdown = tcp_shutdown,
2618 .setsockopt = tcp_setsockopt,
2619 .getsockopt = tcp_getsockopt,
2620 .recvmsg = tcp_recvmsg,
2621 .sendmsg = tcp_sendmsg,
2622 .sendpage = tcp_sendpage,
2623 .backlog_rcv = tcp_v4_do_rcv,
2624 .release_cb = tcp_release_cb,
2625 .mtu_reduced = tcp_v4_mtu_reduced,
2627 .unhash = inet_unhash,
2628 .get_port = inet_csk_get_port,
2629 .enter_memory_pressure = tcp_enter_memory_pressure,
2630 .sockets_allocated = &tcp_sockets_allocated,
2631 .orphan_count = &tcp_orphan_count,
2632 .memory_allocated = &tcp_memory_allocated,
2633 .memory_pressure = &tcp_memory_pressure,
2634 .sysctl_wmem = sysctl_tcp_wmem,
2635 .sysctl_rmem = sysctl_tcp_rmem,
2636 .max_header = MAX_TCP_HEADER,
2637 .obj_size = sizeof(struct tcp_sock),
2638 .slab_flags = SLAB_DESTROY_BY_RCU,
2639 .twsk_prot = &tcp_timewait_sock_ops,
2640 .rsk_prot = &tcp_request_sock_ops,
2641 .h.hashinfo = &tcp_hashinfo,
2642 .no_autobind = true,
2643 #ifdef CONFIG_COMPAT
2644 .compat_setsockopt = compat_tcp_setsockopt,
2645 .compat_getsockopt = compat_tcp_getsockopt,
2647 #ifdef CONFIG_MEMCG_KMEM
2648 .init_cgroup = tcp_init_cgroup,
2649 .destroy_cgroup = tcp_destroy_cgroup,
2650 .proto_cgroup = tcp_proto_cgroup,
2653 EXPORT_SYMBOL(tcp_prot);
2655 static int __net_init tcp_sk_init(struct net *net)
2660 static void __net_exit tcp_sk_exit(struct net *net)
2664 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2666 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2669 static struct pernet_operations __net_initdata tcp_sk_ops = {
2670 .init = tcp_sk_init,
2671 .exit = tcp_sk_exit,
2672 .exit_batch = tcp_sk_exit_batch,
2675 void __init tcp_v4_init(void)
2677 inet_hashinfo_init(&tcp_hashinfo);
2678 if (register_pernet_subsys(&tcp_sk_ops))
2679 panic("Failed to create the TCP control socket.\n");