/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 *              IPv4 specific functions
 *
 *
 *              code split from:
 *              linux/ipv4/tcp.c
 *              linux/ipv4/tcp_input.c
 *              linux/ipv4/tcp_output.c
 *
 *              See tcp.c for author information
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *              David S. Miller :       New socket lookup architecture.
 *                                      This code is dedicated to John Dyson.
 *              David S. Miller :       Change semantics of established hash,
 *                                      half is devoted to TIME_WAIT sockets
 *                                      and the rest go in the other half.
 *              Andi Kleen :            Add support for syncookies and fixed
 *                                      some bugs: ip options weren't passed to
 *                                      the TCP layer, missed a check for an
 *                                      ACK bit.
 *              Andi Kleen :            Implemented fast path mtu discovery.
 *                                      Fixed many serious bugs in the
 *                                      request_sock handling and moved
 *                                      most of it into the af independent code.
 *                                      Added tail drop and some other bugfixes.
 *                                      Added new listen semantics.
 *              Mike McLagan    :       Routing by source
 *      Juan Jose Ciarlante:            ip_dynaddr bits
 *              Andi Kleen:             various fixes.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
 *                                      coma.
 *      Andi Kleen              :       Fix new listen.
 *      Andi Kleen              :       Fix accept error reporting.
 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
 *                                      a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff)
{
        return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
                                          ip_hdr(skb)->saddr,
                                          tcp_hdr(skb)->dest,
                                          tcp_hdr(skb)->source, tsoff);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
        struct tcp_sock *tp = tcp_sk(sk);

        /* With PAWS, it is safe from the viewpoint
           of data integrity. Even without PAWS it is safe provided sequence
           spaces do not overlap i.e. at data rates <= 80Mbit/sec.

           Actually, the idea is close to VJ's one, only timestamp cache is
           held not per host, but per port pair and TW bucket is used as state
           holder.

           If TW bucket has been already destroyed we fall back to VJ's scheme
           and use initial timestamp retrieved from peer table.
         */
        if (tcptw->tw_ts_recent_stamp &&
            (!twp || (sysctl_tcp_tw_reuse &&
                             get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
                if (tp->write_seq == 0)
                        tp->write_seq = 1;
                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                sock_hold(sktw);
                return 1;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
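
/*
 * Example (editor's sketch, not part of the original file): the
 * sysctl_tcp_tw_reuse knob consulted in tcp_twsk_unique() above is what
 * allows a new outgoing connection to take over a TIME-WAIT port pair,
 * subject to the timestamp check. Userspace enables it like this
 * (assumes root; equivalent to `sysctl -w net.ipv4.tcp_tw_reuse=1`):
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

static void enable_tw_reuse(void)
{
        int fd = open("/proc/sys/net/ipv4/tcp_tw_reuse", O_WRONLY);

        if (fd >= 0) {
                write(fd, "1", 1);      /* 0 = off, 1 = reuse if PAWS-safe */
                close(fd);
        }
}
#endif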

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        __be16 orig_sport, orig_dport;
        __be32 daddr, nexthop;
        struct flowi4 *fl4;
        struct rtable *rt;
        int err;
        struct ip_options_rcu *inet_opt;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        nexthop = daddr = usin->sin_addr.s_addr;
        inet_opt = rcu_dereference_protected(inet->inet_opt,
                                             lockdep_sock_is_held(sk));
        if (inet_opt && inet_opt->opt.srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet_opt->opt.faddr;
        }

        orig_sport = inet->inet_sport;
        orig_dport = usin->sin_port;
        fl4 = &inet->cork.fl.u.ip4;
        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                              IPPROTO_TCP,
                              orig_sport, orig_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                if (err == -ENETUNREACH)
                        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
                return err;
        }

        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        if (!inet_opt || !inet_opt->opt.srr)
                daddr = fl4->daddr;

        if (!inet->inet_saddr)
                inet->inet_saddr = fl4->saddr;
        sk_rcv_saddr_set(sk, inet->inet_saddr);

        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                if (likely(!tp->repair))
                        tp->write_seq      = 0;
        }

        if (tcp_death_row.sysctl_tw_recycle &&
            !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
                tcp_fetch_timewait_stamp(sk, &rt->dst);

        inet->inet_dport = usin->sin_port;
        sk_daddr_set(sk, daddr);

        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet_opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

        /* Socket identity is still unknown (sport may be zero).
         * However we set state to SYN-SENT and, without releasing the socket
         * lock, select a source port, enter ourselves into the hash tables
         * and complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(&tcp_death_row, sk);
        if (err)
                goto failure;

        sk_set_txhash(sk);

        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
                               inet->inet_sport, inet->inet_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                rt = NULL;
                goto failure;
        }
        /* OK, now commit destination to socket.  */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->dst);

        if (!tp->write_seq && likely(!tp->repair))
                tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
                                                           inet->inet_daddr,
                                                           inet->inet_sport,
                                                           usin->sin_port,
                                                           &tp->tsoffset);

        inet->inet_id = tp->write_seq ^ jiffies;

        err = tcp_connect(sk);

        rt = NULL;
        if (err)
                goto failure;

        return 0;

failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->inet_dport = 0;
        return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
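
/*
 * Example (editor's sketch, not part of the original file): the userspace
 * side of the function above. A blocking connect() on an AF_INET stream
 * socket reaches tcp_v4_connect() via inet_stream_connect(); the address
 * and port here are placeholders.
 */
#if 0
#include <arpa/inet.h>
#include <sys/socket.h>
#include <unistd.h>

static int connect_v4(const char *ip, unsigned short port)
{
        struct sockaddr_in sin = { .sin_family = AF_INET };
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;
        sin.sin_port = htons(port);
        inet_pton(AF_INET, ip, &sin.sin_addr);
        if (connect(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
                close(fd);              /* e.g. ENETUNREACH from the route lookup */
                return -1;
        }
        return fd;
}
#endif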

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if the socket was owned by the user
 * at the time tcp_v4_err() was called to handle the ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
        struct dst_entry *dst;
        struct inet_sock *inet = inet_sk(sk);
        u32 mtu = tcp_sk(sk)->mtu_info;

        dst = inet_csk_update_pmtu(sk, mtu);
        if (!dst)
                return;

        /* Something is about to go wrong... Remember the soft error
         * in case this connection is not able to recover.
         */
        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                sk->sk_err_soft = EMSGSIZE;

        mtu = dst_mtu(dst);

        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
            ip_sk_accept_pmtu(sk) &&
            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
                tcp_sync_mss(sk, mtu);

                /* Resend the TCP packet because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path mtu
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
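
/*
 * Example (editor's sketch, not part of the original file): whether the
 * ICMP_FRAG_NEEDED feedback handled above is honoured depends on the
 * socket's pmtudisc setting, checked against IP_PMTUDISC_DONT in
 * tcp_v4_mtu_reduced(). A sketch of forcing full PMTU discovery:
 */
#if 0
#include <netinet/in.h>
#include <sys/socket.h>

static int force_pmtu_discovery(int fd)
{
        int val = IP_PMTUDISC_DO;       /* always set DF, rely on ICMP feedback */

        return setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER,
                          &val, sizeof(val));
}
#endif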

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_check(sk, 0);

        if (dst)
                dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
        struct request_sock *req = inet_reqsk(sk);
        struct net *net = sock_net(sk);

        /* ICMPs are not backlogged, hence we cannot get
         * an established socket here.
         */
        if (seq != tcp_rsk(req)->snt_isn) {
                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
        } else if (abort) {
                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(req->rsk_listener, req);
                tcp_listendrop(req->rsk_listener);
        }
        reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
        struct inet_connection_sock *icsk;
        struct tcp_sock *tp;
        struct inet_sock *inet;
        const int type = icmp_hdr(icmp_skb)->type;
        const int code = icmp_hdr(icmp_skb)->code;
        struct sock *sk;
        struct sk_buff *skb;
        struct request_sock *fastopen;
        __u32 seq, snd_una;
        __u32 remaining;
        int err;
        struct net *net = dev_net(icmp_skb->dev);

        sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
                                       th->dest, iph->saddr, ntohs(th->source),
                                       inet_iif(icmp_skb));
        if (!sk) {
                __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
                return;
        }
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put(inet_twsk(sk));
                return;
        }
        seq = ntohl(th->seq);
        if (sk->sk_state == TCP_NEW_SYN_RECV)
                return tcp_req_err(sk, seq,
                                  type == ICMP_PARAMETERPROB ||
                                  type == ICMP_TIME_EXCEEDED ||
                                  (type == ICMP_DEST_UNREACH &&
                                   (code == ICMP_NET_UNREACH ||
                                    code == ICMP_HOST_UNREACH)));

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         * We do take care of PMTU discovery (RFC1191) special case :
         * we can receive locally generated ICMP messages while socket is held.
         */
        if (sock_owned_by_user(sk)) {
                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
                        __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
        }
        if (sk->sk_state == TCP_CLOSE)
                goto out;

        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
                goto out;
        }

        icsk = inet_csk(sk);
        tp = tcp_sk(sk);
        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
        fastopen = tp->fastopen_rsk;
        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, snd_una, tp->snd_nxt)) {
                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        switch (type) {
        case ICMP_REDIRECT:
                do_redirect(icmp_skb, sk);
                goto out;
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        /* We are not interested in TCP_LISTEN and open_requests
                         * (SYN-ACKs sent out by Linux are always < 576 bytes so
                         * they should go through unfragmented).
                         */
                        if (sk->sk_state == TCP_LISTEN)
                                goto out;

                        tp->mtu_info = info;
                        if (!sock_owned_by_user(sk)) {
                                tcp_v4_mtu_reduced(sk);
                        } else {
                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
                                        sock_hold(sk);
                        }
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                /* check if icmp_skb allows revert of backoff
                 * (see draft-zimmermann-tcp-lcd) */
                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
                        break;
                if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
                    !icsk->icsk_backoff || fastopen)
                        break;

                if (sock_owned_by_user(sk))
                        break;

                icsk->icsk_backoff--;
                icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
                                               TCP_TIMEOUT_INIT;
                icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

                skb = tcp_write_queue_head(sk);
                BUG_ON(!skb);

                remaining = icsk->icsk_rto -
                            min(icsk->icsk_rto,
                                tcp_time_stamp - tcp_skb_timestamp(skb));

                if (remaining) {
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                  remaining, TCP_RTO_MAX);
                } else {
                        /* RTO revert clocked out retransmission.
                         * Will retransmit now */
                        tcp_retransmit_timer(sk);
                }

                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->sk_state) {
        case TCP_SYN_SENT:
        case TCP_SYN_RECV:
                /* Only in fast or simultaneous open. If a fast open socket
                 * is already accepted it is treated as a connected one below.
                 */
                if (fastopen && !fastopen->sk)
                        break;

                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows us to consider as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by pmtu discovery).
         *
         * Note that in the modern internet, where routing is unreliable
         * and broken firewalls sit in each dark corner sending random
         * errors ordered by their masters, even these two messages finally
         * lose their original sense (even Linux sends invalid PORT_UNREACHs)
         *
         * Now we are in compliance with RFCs.
         *                                                      --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else  { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}
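
/*
 * Example (editor's sketch, not part of the original file): the
 * inet->recverr test above means a hard error is reported immediately
 * only if the application opted in with IP_RECVERR; otherwise it is
 * left as a soft error until a timeout. Opting in looks like this:
 */
#if 0
#include <netinet/in.h>
#include <sys/socket.h>

static int enable_recverr(int fd)
{
        int one = 1;

        return setsockopt(fd, IPPROTO_IP, IP_RECVERR, &one, sizeof(one));
}
#endif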

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
        struct tcphdr *th = tcp_hdr(skb);

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
                skb->csum_start = skb_transport_header(skb) - skb->head;
                skb->csum_offset = offsetof(struct tcphdr, check);
        } else {
                th->check = tcp_v4_check(skb->len, saddr, daddr,
                                         csum_partial(th,
                                                      th->doff << 2,
                                                      skb->csum));
        }
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
        const struct inet_sock *inet = inet_sk(sk);

        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
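
/*
 * Example (editor's sketch, not part of the original file): the value
 * computed above is the standard RFC 1071 one's-complement sum over an
 * IPv4 pseudo-header followed by the TCP segment. A self-contained
 * userspace version, taking addresses in host byte order and assuming
 * the segment length fits in 16 bits:
 */
#if 0
#include <stddef.h>
#include <stdint.h>

static uint32_t csum_add(uint32_t sum, const uint8_t *p, size_t len)
{
        while (len > 1) {
                sum += (uint32_t)p[0] << 8 | p[1];      /* big-endian 16-bit words */
                p += 2;
                len -= 2;
        }
        if (len)
                sum += (uint32_t)p[0] << 8;             /* odd trailing byte */
        return sum;
}

static uint16_t tcp4_checksum(uint32_t saddr, uint32_t daddr,
                              const uint8_t *tcp, size_t len)
{
        uint8_t ph[12] = {                              /* pseudo-header */
                saddr >> 24, saddr >> 16, saddr >> 8, saddr,
                daddr >> 24, daddr >> 16, daddr >> 8, daddr,
                0, 6 /* IPPROTO_TCP */, len >> 8, len,
        };
        uint32_t sum = csum_add(csum_add(0, ph, sizeof(ph)), tcp, len);

        while (sum >> 16)                               /* fold the carries */
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;                          /* one's complement */
}
#endif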

/*
 *      This routine will send an RST to the other tcp.
 *
 *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *                    for reset.
 *      Answer: if a packet caused RST, it is not for a socket
 *              existing in our system, if it is matched to a socket,
 *              it is just duplicate segment or bug in other side's TCP.
 *              So we build the reply based only on the parameters
 *              that arrived with the segment.
 *      Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key = NULL;
        const __u8 *hash_location = NULL;
        unsigned char newhash[16];
        int genhash;
        struct sock *sk1 = NULL;
#endif
        struct net *net;

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        /* If sk not NULL, it means we did a successful lookup and incoming
         * route had to be correct. prequeue might have dropped our dst.
         */
        if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rep, 0, sizeof(rep));
        rep.th.dest   = th->source;
        rep.th.source = th->dest;
        rep.th.doff   = sizeof(struct tcphdr) / 4;
        rep.th.rst    = 1;

        if (th->ack) {
                rep.th.seq = th->ack_seq;
        } else {
                rep.th.ack = 1;
                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                       skb->len - (th->doff << 2));
        }

        memset(&arg, 0, sizeof(arg));
        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);

        net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
        rcu_read_lock();
        hash_location = tcp_parse_md5sig_option(th);
        if (sk && sk_fullsock(sk)) {
                key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
                                        &ip_hdr(skb)->saddr, AF_INET);
        } else if (hash_location) {
                /*
                 * The active side is lost. Try to find the listening socket
                 * through the source port, and then find the md5 key through
                 * the listening socket. We do not lose security here:
                 * the incoming packet is checked with the md5 hash of the key
                 * we find; no RST is generated if the md5 hash doesn't match.
                 */
                sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
                                             ip_hdr(skb)->saddr,
                                             th->source, ip_hdr(skb)->daddr,
                                             ntohs(th->source), inet_iif(skb));
                /* don't send rst if it can't find key */
                if (!sk1)
                        goto out;

                key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
                                        &ip_hdr(skb)->saddr, AF_INET);
                if (!key)
                        goto out;


                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
                if (genhash || memcmp(hash_location, newhash, 16) != 0)
                        goto out;

        }

        if (key) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
                                   (TCPOPT_NOP << 16) |
                                   (TCPOPT_MD5SIG << 8) |
                                   TCPOLEN_MD5SIG);
                /* Update length and the length the header thinks exists */
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
                                     key, ip_hdr(skb)->saddr,
                                     ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

        /* When the socket is gone, all binding information is lost.
         * Routing might fail in this case. No choice here: if we choose to
         * force the input interface, we will misroute in case of an
         * asymmetric route.
         */
        if (sk)
                arg.bound_dev_if = sk->sk_bound_dev_if;

        BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
                     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

        arg.tos = ip_hdr(skb)->tos;
        arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
        local_bh_disable();
        ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len);

        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
        __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
        local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
        rcu_read_unlock();
#endif
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 * outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
                            struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 tsval, u32 tsecr, int oif,
                            struct tcp_md5sig_key *key,
                            int reply_flags, u8 tos)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
                        ];
        } rep;
        struct net *net = sock_net(sk);
        struct ip_reply_arg arg;

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof(arg));

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (tsecr) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) |
                                   TCPOLEN_TIMESTAMP);
                rep.opt[1] = htonl(tsval);
                rep.opt[2] = htonl(tsecr);
                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
        }

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
        if (key) {
                int offset = (tsecr) ? 3 : 0;

                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_MD5SIG << 8) |
                                          TCPOLEN_MD5SIG);
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len/4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
                                    key, ip_hdr(skb)->saddr,
                                    ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.flags = reply_flags;
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        if (oif)
                arg.bound_dev_if = oif;
        arg.tos = tos;
        arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
        local_bh_disable();
        ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len);

        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
        local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

        tcp_v4_send_ack(sk, skb,
                        tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcp_time_stamp + tcptw->tw_ts_offset,
                        tcptw->tw_ts_recent,
                        tw->tw_bound_dev_if,
                        tcp_twsk_md5_key(tcptw),
                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        tw->tw_tos
                        );

        inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req)
{
        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
         */
        u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
                                             tcp_sk(sk)->snd_nxt;

        /* RFC 7323 2.3
         * The window field (SEG.WND) of every outgoing segment, with the
         * exception of <SYN> segments, MUST be right-shifted by
         * Rcv.Wind.Shift bits:
         */
        tcp_v4_send_ack(sk, skb, seq,
                        tcp_rsk(req)->rcv_nxt,
                        req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
                        tcp_time_stamp + tcp_rsk(req)->ts_off,
                        req->ts_recent,
                        0,
                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
                                          AF_INET),
                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        ip_hdr(skb)->tos);
}

/*
 *      Send a SYN-ACK after having received a SYN.
 *      This still operates on a request_sock only, not on a big
 *      socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
                              struct flowi *fl,
                              struct request_sock *req,
                              struct tcp_fastopen_cookie *foc,
                              enum tcp_synack_type synack_type)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        struct flowi4 fl4;
        int err = -1;
        struct sk_buff *skb;

        /* First, grab a route. */
        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
                return -1;

        skb = tcp_make_synack(sk, dst, req, foc, synack_type);

        if (skb) {
                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

                err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
                                            ireq->ir_rmt_addr,
                                            ireq->opt);
                err = net_xmit_eval(err);
        }

        return err;
}

/*
 *      IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
        kfree(inet_rsk(req)->opt);
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
                                         const union tcp_md5_addr *addr,
                                         int family)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        unsigned int size = sizeof(struct in_addr);
        const struct tcp_md5sig_info *md5sig;

        /* caller either holds rcu_read_lock() or socket lock */
        md5sig = rcu_dereference_check(tp->md5sig_info,
                                       lockdep_sock_is_held(sk));
        if (!md5sig)
                return NULL;
#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6)
                size = sizeof(struct in6_addr);
#endif
        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
                if (key->family != family)
                        continue;
                if (!memcmp(&key->addr, addr, size))
                        return key;
        }
        return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
                                         const struct sock *addr_sk)
{
        const union tcp_md5_addr *addr;

        addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
        return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
                   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
        /* Add Key to the list */
        struct tcp_md5sig_key *key;
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_info *md5sig;

        key = tcp_md5_do_lookup(sk, addr, family);
        if (key) {
                /* Pre-existing entry - just update that one. */
                memcpy(key->key, newkey, newkeylen);
                key->keylen = newkeylen;
                return 0;
        }

        md5sig = rcu_dereference_protected(tp->md5sig_info,
                                           lockdep_sock_is_held(sk));
        if (!md5sig) {
                md5sig = kmalloc(sizeof(*md5sig), gfp);
                if (!md5sig)
                        return -ENOMEM;

                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
                INIT_HLIST_HEAD(&md5sig->head);
                rcu_assign_pointer(tp->md5sig_info, md5sig);
        }

        key = sock_kmalloc(sk, sizeof(*key), gfp);
        if (!key)
                return -ENOMEM;
        if (!tcp_alloc_md5sig_pool()) {
                sock_kfree_s(sk, key, sizeof(*key));
                return -ENOMEM;
        }

        memcpy(key->key, newkey, newkeylen);
        key->keylen = newkeylen;
        key->family = family;
        memcpy(&key->addr, addr,
               (family == AF_INET6) ? sizeof(struct in6_addr) :
                                      sizeof(struct in_addr));
        hlist_add_head_rcu(&key->node, &md5sig->head);
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
        struct tcp_md5sig_key *key;

        key = tcp_md5_do_lookup(sk, addr, family);
        if (!key)
                return -ENOENT;
        hlist_del_rcu(&key->node);
        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
        kfree_rcu(key, rcu);
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        struct hlist_node *n;
        struct tcp_md5sig_info *md5sig;

        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
                hlist_del_rcu(&key->node);
                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
                kfree_rcu(key, rcu);
        }
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
                                 int optlen)
{
        struct tcp_md5sig cmd;
        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

        if (optlen < sizeof(cmd))
                return -EINVAL;

        if (copy_from_user(&cmd, optval, sizeof(cmd)))
                return -EFAULT;

        if (sin->sin_family != AF_INET)
                return -EINVAL;

        if (!cmd.tcpm_keylen)
                return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                                      AF_INET);

        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
                return -EINVAL;

        return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                              AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
                              GFP_KERNEL);
}
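
/*
 * Example (editor's sketch, not part of the original file): the buffer
 * parsed above arrives from userspace via setsockopt(TCP_MD5SIG) with a
 * struct tcp_md5sig from <linux/tcp.h>. Roughly:
 */
#if 0
#include <linux/tcp.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static int set_md5_key(int fd, const struct sockaddr_in *peer,
                       const void *key, unsigned short keylen)
{
        struct tcp_md5sig md5;

        memset(&md5, 0, sizeof(md5));
        memcpy(&md5.tcpm_addr, peer, sizeof(*peer));    /* peer to sign for */
        md5.tcpm_keylen = keylen;       /* <= TCP_MD5SIG_MAXKEYLEN; 0 deletes */
        memcpy(md5.tcpm_key, key, keylen);
        return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}
#endif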

static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
                                   __be32 daddr, __be32 saddr,
                                   const struct tcphdr *th, int nbytes)
{
        struct tcp4_pseudohdr *bp;
        struct scatterlist sg;
        struct tcphdr *_th;

        bp = hp->scratch;
        bp->saddr = saddr;
        bp->daddr = daddr;
        bp->pad = 0;
        bp->protocol = IPPROTO_TCP;
        bp->len = cpu_to_be16(nbytes);

        _th = (struct tcphdr *)(bp + 1);
        memcpy(_th, th, sizeof(*th));
        _th->check = 0;

        sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
        ahash_request_set_crypt(hp->md5_req, &sg, NULL,
                                sizeof(*bp) + sizeof(*th));
        return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
        struct tcp_md5sig_pool *hp;
        struct ahash_request *req;

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        req = hp->md5_req;

        if (crypto_ahash_init(req))
                goto clear_hash;
        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        ahash_request_set_crypt(req, NULL, md5_hash, 0);
        if (crypto_ahash_final(req))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
                        const struct sock *sk,
                        const struct sk_buff *skb)
{
        struct tcp_md5sig_pool *hp;
        struct ahash_request *req;
        const struct tcphdr *th = tcp_hdr(skb);
        __be32 saddr, daddr;

        if (sk) { /* valid for establish/request sockets */
                saddr = sk->sk_rcv_saddr;
                daddr = sk->sk_daddr;
        } else {
                const struct iphdr *iph = ip_hdr(skb);
                saddr = iph->saddr;
                daddr = iph->daddr;
        }

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        req = hp->md5_req;

        if (crypto_ahash_init(req))
                goto clear_hash;

        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
                goto clear_hash;
        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        ahash_request_set_crypt(req, NULL, md5_hash, 0);
        if (crypto_ahash_final(req))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
                                    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
        /*
         * This gets called for each TCP segment that arrives
         * so we want to be efficient.
         * We have 3 drop cases:
         * o No MD5 hash and one expected.
         * o MD5 hash and we're not expecting one.
         * o MD5 hash and it's wrong.
         */
        const __u8 *hash_location = NULL;
        struct tcp_md5sig_key *hash_expected;
        const struct iphdr *iph = ip_hdr(skb);
        const struct tcphdr *th = tcp_hdr(skb);
        int genhash;
        unsigned char newhash[16];

        hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
                                          AF_INET);
        hash_location = tcp_parse_md5sig_option(th);

        /* We've parsed the options - do we have a hash? */
        if (!hash_expected && !hash_location)
                return false;

        if (hash_expected && !hash_location) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
                return true;
        }

        if (!hash_expected && hash_location) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
                return true;
        }

        /* Okay, so this is hash_expected and hash_location -
         * so we need to calculate the checksum.
         */
        genhash = tcp_v4_md5_hash_skb(newhash,
                                      hash_expected,
                                      NULL, skb);

        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
                                     &iph->saddr, ntohs(th->source),
                                     &iph->daddr, ntohs(th->dest),
                                     genhash ? " tcp_v4_calc_md5_hash failed"
                                     : "");
                return true;
        }
        return false;
#endif
        return false;
}

static void tcp_v4_init_req(struct request_sock *req,
                            const struct sock *sk_listener,
                            struct sk_buff *skb)
{
        struct inet_request_sock *ireq = inet_rsk(req);

        sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
        sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
        ireq->opt = tcp_v4_save_options(skb);
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
                                          struct flowi *fl,
                                          const struct request_sock *req,
                                          bool *strict)
{
        struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);

        if (strict) {
                if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
                        *strict = true;
                else
                        *strict = false;
        }

        return dst;
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .family         =       PF_INET,
        .obj_size       =       sizeof(struct tcp_request_sock),
        .rtx_syn_ack    =       tcp_rtx_synack,
        .send_ack       =       tcp_v4_reqsk_send_ack,
        .destructor     =       tcp_v4_reqsk_destructor,
        .send_reset     =       tcp_v4_send_reset,
        .syn_ack_timeout =      tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .mss_clamp      =       TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
        .req_md5_lookup =       tcp_v4_md5_lookup,
        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
#endif
        .init_req       =       tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
        .cookie_init_seq =      cookie_v4_init_sequence,
#endif
        .route_req      =       tcp_v4_route_req,
        .init_seq       =       tcp_v4_init_sequence,
        .send_synack    =       tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        /* Never answer SYNs sent to broadcast or multicast addresses */
        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        return tcp_conn_request(&tcp_request_sock_ops,
                                &tcp_request_sock_ipv4_ops, sk, skb);

drop:
        tcp_listendrop(sk);
        return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req,
                                  struct dst_entry *dst,
                                  struct request_sock *req_unhash,
                                  bool *own_req)
{
        struct inet_request_sock *ireq;
        struct inet_sock *newinet;
        struct tcp_sock *newtp;
        struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
#endif
        struct ip_options_rcu *inet_opt;

        if (sk_acceptq_is_full(sk))
                goto exit_overflow;

        newsk = tcp_create_openreq_child(sk, req, skb);
        if (!newsk)
                goto exit_nonewsk;

        newsk->sk_gso_type = SKB_GSO_TCPV4;
        inet_sk_rx_dst_set(newsk, skb);

        newtp                 = tcp_sk(newsk);
        newinet               = inet_sk(newsk);
        ireq                  = inet_rsk(req);
        sk_daddr_set(newsk, ireq->ir_rmt_addr);
        sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
        newsk->sk_bound_dev_if = ireq->ir_iif;
        newinet->inet_saddr           = ireq->ir_loc_addr;
        inet_opt              = ireq->opt;
        rcu_assign_pointer(newinet->inet_opt, inet_opt);
        ireq->opt             = NULL;
        newinet->mc_index     = inet_iif(skb);
        newinet->mc_ttl       = ip_hdr(skb)->ttl;
        newinet->rcv_tos      = ip_hdr(skb)->tos;
        inet_csk(newsk)->icsk_ext_hdr_len = 0;
        if (inet_opt)
                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
        newinet->inet_id = newtp->write_seq ^ jiffies;

        if (!dst) {
                dst = inet_csk_route_child_sock(sk, newsk, req);
                if (!dst)
                        goto put_and_exit;
        } else {
                /* syncookie case : see end of cookie_v4_check() */
        }
        sk_setup_caps(newsk, dst);

        tcp_ca_openreq_child(newsk, dst);

        tcp_sync_mss(newsk, dst_mtu(dst));
        newtp->advmss = dst_metric_advmss(dst);
        if (tcp_sk(sk)->rx_opt.user_mss &&
            tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
                newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

        tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
        /* Copy over the MD5 key from the original socket */
        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
                                AF_INET);
        if (key) {
                /*
                 * We're using one, so create a matching key
                 * on the newsk structure. If we fail to get
                 * memory, then we end up not copying the key
                 * across. Shucks.
                 */
                tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
                               AF_INET, key->key, key->keylen, GFP_ATOMIC);
                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
        }
#endif

        if (__inet_inherit_port(sk, newsk) < 0)
                goto put_and_exit;
        *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
        if (*own_req)
                tcp_move_syn(newtp, req);

        return newsk;

exit_overflow:
        NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
        dst_release(dst);
exit:
        tcp_listendrop(sk);
        return NULL;
put_and_exit:
        inet_csk_prepare_forced_close(newsk);
        tcp_done(newsk);
        goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
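
/*
 * Example (editor's sketch, not part of the original file): the passive
 * open that drives this path. listen() arms tcp_v4_conn_request() for
 * incoming SYNs; the final ACK of the handshake creates the child via
 * tcp_v4_syn_recv_sock(), which accept() then returns. The port is a
 * placeholder:
 */
#if 0
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

static int serve_once(unsigned short port)
{
        struct sockaddr_in sin = { .sin_family = AF_INET };
        int c, fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;
        sin.sin_port = htons(port);
        sin.sin_addr.s_addr = htonl(INADDR_ANY);
        if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0 ||
            listen(fd, 128) < 0) {
                close(fd);
                return -1;
        }
        c = accept(fd, NULL, NULL);     /* child from tcp_v4_syn_recv_sock() */
        close(fd);
        return c;
}
#endif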

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
        const struct tcphdr *th = tcp_hdr(skb);

        if (!th->syn)
                sk = cookie_v4_check(sk, skb);
#endif
        return sk;
}

/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
        struct sock *rsk;

        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
                struct dst_entry *dst = sk->sk_rx_dst;

                sock_rps_save_rxhash(sk, skb);
                sk_mark_napi_id(sk, skb);
                if (dst) {
                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
                            !dst->ops->check(dst, 0)) {
                                dst_release(dst);
                                sk->sk_rx_dst = NULL;
                        }
                }
                tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
                return 0;
        }

        if (tcp_checksum_complete(skb))
                goto csum_err;

        if (sk->sk_state == TCP_LISTEN) {
                struct sock *nsk = tcp_v4_cookie_check(sk, skb);

                if (!nsk)
                        goto discard;
                if (nsk != sk) {
                        sock_rps_save_rxhash(nsk, skb);
                        sk_mark_napi_id(nsk, skb);
                        if (tcp_child_process(sk, nsk, skb)) {
                                rsk = nsk;
                                goto reset;
                        }
                        return 0;
                }
        } else
                sock_rps_save_rxhash(sk, skb);

        if (tcp_rcv_state_process(sk, skb)) {
                rsk = sk;
                goto reset;
        }
        return 0;

reset:
        tcp_v4_send_reset(rsk, skb);
discard:
        kfree_skb(skb);
        /* Be careful here. If this function gets more complicated and
         * gcc suffers from register pressure on the x86, sk (in %ebx)
         * might be destroyed here. This current version compiles correctly,
         * but you have been warned.
         */
        return 0;

csum_err:
        TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
        TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
        goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

void tcp_v4_early_demux(struct sk_buff *skb)
{
        const struct iphdr *iph;
        const struct tcphdr *th;
        struct sock *sk;

        if (skb->pkt_type != PACKET_HOST)
                return;

        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
                return;

        iph = ip_hdr(skb);
        th = tcp_hdr(skb);

        if (th->doff < sizeof(struct tcphdr) / 4)
                return;

        sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
                                       iph->saddr, th->source,
                                       iph->daddr, ntohs(th->dest),
                                       skb->skb_iif);
        if (sk) {
                skb->sk = sk;
                skb->destructor = sock_edemux;
                if (sk_fullsock(sk)) {
                        struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

                        if (dst)
                                dst = dst_check(dst, 0);
                        if (dst &&
                            inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
                                skb_dst_set_noref(skb, dst);
                }
        }
}
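
/* Early demux runs from the IP receive path, before routing: if the
 * segment belongs to an already-established socket, that socket (and its
 * cached sk_rx_dst, validated against the incoming interface) is attached
 * to the skb, letting the stack skip a full route lookup in the common
 * case.
 */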

/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see why it failed. 8)8)                               --ANK
 *
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (sysctl_tcp_low_latency || !tp->ucopy.task)
                return false;

        if (skb->len <= tcp_hdrlen(skb) &&
            skb_queue_len(&tp->ucopy.prequeue) == 0)
                return false;
        /* Before escaping the RCU protected region, we need to take care
         * of the skb dst. The prequeue is only enabled for established
         * sockets, and for such sockets we might need the skb dst only to
         * set sk->sk_rx_dst. So instead of a full sk_rx_dst validity
         * check here, let's perform an optimistic one.
         */
        if (likely(sk->sk_rx_dst))
                skb_dst_drop(skb);
        else
                skb_dst_force_safe(skb);

        __skb_queue_tail(&tp->ucopy.prequeue, skb);
        tp->ucopy.memory += skb->truesize;
        if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
            tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
                struct sk_buff *skb1;

                BUG_ON(sock_owned_by_user(sk));
                __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
                                skb_queue_len(&tp->ucopy.prequeue));

                while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
                        sk_backlog_rcv(sk, skb1);

                tp->ucopy.memory = 0;
        } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
                wake_up_interruptible_sync_poll(sk_sleep(sk),
                                           POLLIN | POLLRDNORM | POLLRDBAND);
                if (!inet_csk_ack_scheduled(sk))
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
                                                  (3 * tcp_rto_min(sk)) / 4,
                                                  TCP_RTO_MAX);
        }
        return true;
}
EXPORT_SYMBOL(tcp_prequeue);
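
/* The prequeue defers protocol processing into the context of the task
 * blocked in tcp_recvmsg(), so the payload is handled on the CPU that is
 * about to consume it. Writing 1 to /proc/sys/net/ipv4/tcp_low_latency
 * (the sysctl_tcp_low_latency knob tested above) disables prequeueing and
 * processes segments directly in softirq context instead.
 */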

bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
        u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;

        /* Only the socket owner can try to collapse/prune its receive
         * queues to reduce memory overhead, so add a little headroom here.
         * Only a few sockets' backlogs are likely to be non-empty
         * concurrently.
         */
        limit += 64*1024;

        /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
         * we can fix skb->truesize to its real value to avoid future drops.
         * This is valid because skb is not yet charged to the socket.
         * It has been noticed that pure SACK packets were sometimes dropped
         * (if cooked by drivers without the copybreak feature).
         */
        if (!skb->data_len)
                skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));

        if (unlikely(sk_add_backlog(sk, skb, limit))) {
                bh_unlock_sock(sk);
                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
                return true;
        }
        return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
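
/* Worked example of the limit above (assuming common buffer defaults,
 * for illustration only): with sk_rcvbuf = sk_sndbuf = 212992 bytes,
 *
 *	limit = 212992 + 212992 + 65536 = 491520 bytes
 *
 * of skb truesize may sit in the backlog while the socket is owned by a
 * user context; anything beyond that is dropped and counted in
 * LINUX_MIB_TCPBACKLOGDROP.
 */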

int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
        struct tcphdr *th = (struct tcphdr *)skb->data;
        unsigned int eaten = skb->len;
        int err;

        err = sk_filter_trim_cap(sk, skb, th->doff * 4);
        if (!err) {
                eaten -= skb->len;
                TCP_SKB_CB(skb)->end_seq -= eaten;
        }
        return err;
}
EXPORT_SYMBOL(tcp_filter);
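
/* A socket filter may trim the skb, but it is never allowed to cut into
 * the TCP header itself (hence the th->doff * 4 cap). Whatever payload
 * the filter removed ("eaten" bytes) must also be subtracted from
 * end_seq, so that sequence accounting still matches the bytes actually
 * queued to the socket.
 */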

/*
 *      From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
        struct net *net = dev_net(skb->dev);
        const struct iphdr *iph;
        const struct tcphdr *th;
        bool refcounted;
        struct sock *sk;
        int ret;

        if (skb->pkt_type != PACKET_HOST)
                goto discard_it;

        /* Count it even if it's bad */
        __TCP_INC_STATS(net, TCP_MIB_INSEGS);

        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
                goto discard_it;

        th = (const struct tcphdr *)skb->data;

        if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
                goto bad_packet;
        if (!pskb_may_pull(skb, th->doff * 4))
                goto discard_it;

        /* An explanation is required here, I think.
         * Packet length and doff are validated by header prediction,
         * provided the case of th->doff == 0 is eliminated.
         * So, we defer the checks. */

        if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
                goto csum_error;

        th = (const struct tcphdr *)skb->data;
        iph = ip_hdr(skb);
        /* This is tricky: we move the IPCB into its correct location inside
         * TCP_SKB_CB(); barrier() makes sure the compiler won't play
         * aliasing games.
         */
        memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
                sizeof(struct inet_skb_parm));
        barrier();

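        /* end_seq counts every sequence number the segment consumes: the
         * payload bytes plus one for SYN and one for FIN, each of which
         * occupies a sequence number of its own. For example, a pure SYN
         * (no data) yields end_seq = seq + 1.
         */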
        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
                                    skb->len - th->doff * 4);
        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
        TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
        TCP_SKB_CB(skb)->tcp_tw_isn = 0;
        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
        TCP_SKB_CB(skb)->sacked  = 0;

lookup:
        sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
                               th->dest, &refcounted);
        if (!sk)
                goto no_tcp_socket;

process:
        if (sk->sk_state == TCP_TIME_WAIT)
                goto do_time_wait;

        if (sk->sk_state == TCP_NEW_SYN_RECV) {
                struct request_sock *req = inet_reqsk(sk);
                struct sock *nsk;

                sk = req->rsk_listener;
                if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
                        sk_drops_add(sk, skb);
                        reqsk_put(req);
                        goto discard_it;
                }
                if (unlikely(sk->sk_state != TCP_LISTEN)) {
                        inet_csk_reqsk_queue_drop_and_put(sk, req);
                        goto lookup;
                }
                /* We own a reference on the listener, increase it again
                 * as we might lose it too soon.
                 */
                sock_hold(sk);
                refcounted = true;
                nsk = tcp_check_req(sk, skb, req, false);
                if (!nsk) {
                        reqsk_put(req);
                        goto discard_and_relse;
                }
                if (nsk == sk) {
                        reqsk_put(req);
                } else if (tcp_child_process(sk, nsk, skb)) {
                        tcp_v4_send_reset(nsk, skb);
                        goto discard_and_relse;
                } else {
                        sock_put(sk);
                        return 0;
                }
        }
        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
                goto discard_and_relse;
        }

        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
                goto discard_and_relse;

        if (tcp_v4_inbound_md5_hash(sk, skb))
                goto discard_and_relse;

        nf_reset(skb);

        if (tcp_filter(sk, skb))
                goto discard_and_relse;
        th = (const struct tcphdr *)skb->data;
        iph = ip_hdr(skb);

        skb->dev = NULL;

        if (sk->sk_state == TCP_LISTEN) {
                ret = tcp_v4_do_rcv(sk, skb);
                goto put_and_return;
        }

        sk_incoming_cpu_update(sk);

        bh_lock_sock_nested(sk);
        tcp_segs_in(tcp_sk(sk), skb);
        ret = 0;
        if (!sock_owned_by_user(sk)) {
                if (!tcp_prequeue(sk, skb))
                        ret = tcp_v4_do_rcv(sk, skb);
        } else if (tcp_add_backlog(sk, skb)) {
                goto discard_and_relse;
        }
        bh_unlock_sock(sk);

put_and_return:
        if (refcounted)
                sock_put(sk);

        return ret;

no_tcp_socket:
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
                goto discard_it;

        if (tcp_checksum_complete(skb)) {
csum_error:
                __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
                __TCP_INC_STATS(net, TCP_MIB_INERRS);
        } else {
                tcp_v4_send_reset(NULL, skb);
        }

discard_it:
        /* Discard frame. */
        kfree_skb(skb);
        return 0;

discard_and_relse:
        sk_drops_add(sk, skb);
        if (refcounted)
                sock_put(sk);
        goto discard_it;

do_time_wait:
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                inet_twsk_put(inet_twsk(sk));
                goto discard_it;
        }

        if (tcp_checksum_complete(skb)) {
                inet_twsk_put(inet_twsk(sk));
                goto csum_error;
        }
        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
        case TCP_TW_SYN: {
                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
                                                        &tcp_hashinfo, skb,
                                                        __tcp_hdrlen(th),
                                                        iph->saddr, th->source,
                                                        iph->daddr, th->dest,
                                                        inet_iif(skb));
                if (sk2) {
                        inet_twsk_deschedule_put(inet_twsk(sk));
                        sk = sk2;
                        refcounted = false;
                        goto process;
                }
                /* Fall through to ACK */
        }
        case TCP_TW_ACK:
                tcp_v4_timewait_ack(sk, skb);
                break;
        case TCP_TW_RST:
                tcp_v4_send_reset(sk, skb);
                inet_twsk_deschedule_put(inet_twsk(sk));
                goto discard_it;
        case TCP_TW_SUCCESS:;
        }
        goto discard_it;
}

static struct timewait_sock_ops tcp_timewait_sock_ops = {
        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
        .twsk_unique    = tcp_twsk_unique,
        .twsk_destructor= tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        if (dst && dst_hold_safe(dst)) {
                sk->sk_rx_dst = dst;
                inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
        }
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);
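
/* inet_sk_rx_dst_set() is the af_ops->sk_rx_dst_set hook: it caches the
 * input route (and the interface it arrived on) on the socket, so that
 * the fast path in tcp_v4_do_rcv() and tcp_v4_early_demux() can reuse it
 * instead of doing a route lookup per segment. dst_hold_safe() takes a
 * reference only if the dst is not already being destroyed.
 */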

const struct inet_connection_sock_af_ops ipv4_specific = {
        .queue_xmit        = ip_queue_xmit,
        .send_check        = tcp_v4_send_check,
        .rebuild_header    = inet_sk_rebuild_header,
        .sk_rx_dst_set     = inet_sk_rx_dst_set,
        .conn_request      = tcp_v4_conn_request,
        .syn_recv_sock     = tcp_v4_syn_recv_sock,
        .net_header_len    = sizeof(struct iphdr),
        .setsockopt        = ip_setsockopt,
        .getsockopt        = ip_getsockopt,
        .addr2sockaddr     = inet_csk_addr2sockaddr,
        .sockaddr_len      = sizeof(struct sockaddr_in),
        .bind_conflict     = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
        .compat_setsockopt = compat_ip_setsockopt,
        .compat_getsockopt = compat_ip_getsockopt,
#endif
        .mtu_reduced       = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
        .md5_lookup             = tcp_v4_md5_lookup,
        .calc_md5_hash          = tcp_v4_md5_hash_skb,
        .md5_parse              = tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: a lot of things are set to zero explicitly by the call to
 *       sk_alloc(), so they need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        tcp_init_sock(sk);

        icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

        return 0;
}

void tcp_v4_destroy_sock(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        tcp_clear_xmit_timers(sk);

        tcp_cleanup_congestion_control(sk);

        /* Clean up the write buffer. */
        tcp_write_queue_purge(sk);

        /* Cleans up our, hopefully empty, out_of_order_queue. */
        skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
        /* Clean up the MD5 key list, if any */
        if (tp->md5sig_info) {
                tcp_clear_md5_list(sk);
                kfree_rcu(tp->md5sig_info, rcu);
                tp->md5sig_info = NULL;
        }
#endif

        /* Clean up the prequeue; it really should be empty by now. */
        __skb_queue_purge(&tp->ucopy.prequeue);

        /* Clean up a referenced TCP bind bucket. */
        if (inet_csk(sk)->icsk_bind_hash)
                inet_put_port(sk);

        BUG_ON(tp->fastopen_rsk);

        /* If the socket was aborted during the connect operation */
        tcp_free_fastopen_req(tp);
        tcp_saved_syn_free(tp);

        local_bh_disable();
        sk_sockets_allocated_dec(sk);
        local_bh_enable();
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get the next listener socket following cur.  If cur is NULL, get the
 * first socket, starting from the bucket given in st->bucket; when
 * st->bucket is zero, the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
        struct tcp_iter_state *st = seq->private;
        struct net *net = seq_file_net(seq);
        struct inet_listen_hashbucket *ilb;
        struct sock *sk = cur;

        if (!sk) {
get_head:
                ilb = &tcp_hashinfo.listening_hash[st->bucket];
                spin_lock(&ilb->lock);
                sk = sk_head(&ilb->head);
                st->offset = 0;
                goto get_sk;
        }
        ilb = &tcp_hashinfo.listening_hash[st->bucket];
        ++st->num;
        ++st->offset;

        sk = sk_next(sk);
get_sk:
        sk_for_each_from(sk) {
                if (!net_eq(sock_net(sk), net))
                        continue;
                if (sk->sk_family == st->family)
                        return sk;
        }
        spin_unlock(&ilb->lock);
        st->offset = 0;
        if (++st->bucket < INET_LHTABLE_SIZE)
                goto get_head;
        return NULL;
}
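
/* The iterator keeps three cursors in tcp_iter_state: st->bucket (hash
 * bucket), st->offset (position within the bucket) and st->num (global
 * position). Together they let tcp_seek_last_pos() resume a partially
 * read /proc/net/tcp dump near where the previous read() stopped, instead
 * of rescanning the whole table from the start.
 */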

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
        struct tcp_iter_state *st = seq->private;
        void *rc;

        st->bucket = 0;
        st->offset = 0;
        rc = listening_get_next(seq, NULL);

        while (rc && *pos) {
                rc = listening_get_next(seq, rc);
                --*pos;
        }
        return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
        return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

/*
 * Get the first established socket, starting from the bucket given in
 * st->bucket.  If st->bucket is zero, the very first socket in the hash
 * table is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
        struct tcp_iter_state *st = seq->private;
        struct net *net = seq_file_net(seq);
        void *rc = NULL;

        st->offset = 0;
        for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
                struct sock *sk;
                struct hlist_nulls_node *node;
                spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

                /* Lockless fast path for the common case of empty buckets */
                if (empty_bucket(st))
                        continue;

                spin_lock_bh(lock);
                sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
                        if (sk->sk_family != st->family ||
                            !net_eq(sock_net(sk), net)) {
                                continue;
                        }
                        rc = sk;
                        goto out;
                }
                spin_unlock_bh(lock);
        }
out:
        return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
        struct sock *sk = cur;
        struct hlist_nulls_node *node;
        struct tcp_iter_state *st = seq->private;
        struct net *net = seq_file_net(seq);

        ++st->num;
        ++st->offset;

        sk = sk_nulls_next(sk);

        sk_nulls_for_each_from(sk, node) {
                if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
                        return sk;
        }

        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
        ++st->bucket;
        return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
        struct tcp_iter_state *st = seq->private;
        void *rc;

        st->bucket = 0;
        rc = established_get_first(seq);

        while (rc && pos) {
                rc = established_get_next(seq, rc);
                --pos;
        }
        return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
        void *rc;
        struct tcp_iter_state *st = seq->private;

        st->state = TCP_SEQ_STATE_LISTENING;
        rc        = listening_get_idx(seq, &pos);

        if (!rc) {
                st->state = TCP_SEQ_STATE_ESTABLISHED;
                rc        = established_get_idx(seq, pos);
        }

        return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
        struct tcp_iter_state *st = seq->private;
        int offset = st->offset;
        int orig_num = st->num;
        void *rc = NULL;

        switch (st->state) {
        case TCP_SEQ_STATE_LISTENING:
                if (st->bucket >= INET_LHTABLE_SIZE)
                        break;
                st->state = TCP_SEQ_STATE_LISTENING;
                rc = listening_get_next(seq, NULL);
                while (offset-- && rc)
                        rc = listening_get_next(seq, rc);
                if (rc)
                        break;
                st->bucket = 0;
                st->state = TCP_SEQ_STATE_ESTABLISHED;
                /* Fallthrough */
        case TCP_SEQ_STATE_ESTABLISHED:
                if (st->bucket > tcp_hashinfo.ehash_mask)
                        break;
                rc = established_get_first(seq);
                while (offset-- && rc)
                        rc = established_get_next(seq, rc);
        }

        st->num = orig_num;

        return rc;
}

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
        struct tcp_iter_state *st = seq->private;
        void *rc;

        if (*pos && *pos == st->last_pos) {
                rc = tcp_seek_last_pos(seq);
                if (rc)
                        goto out;
        }

        st->state = TCP_SEQ_STATE_LISTENING;
        st->num = 0;
        st->bucket = 0;
        st->offset = 0;
        rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
        st->last_pos = *pos;
        return rc;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct tcp_iter_state *st = seq->private;
        void *rc = NULL;

        if (v == SEQ_START_TOKEN) {
                rc = tcp_get_idx(seq, 0);
                goto out;
        }

        switch (st->state) {
        case TCP_SEQ_STATE_LISTENING:
                rc = listening_get_next(seq, v);
                if (!rc) {
                        st->state = TCP_SEQ_STATE_ESTABLISHED;
                        st->bucket = 0;
                        st->offset = 0;
                        rc        = established_get_first(seq);
                }
                break;
        case TCP_SEQ_STATE_ESTABLISHED:
                rc = established_get_next(seq, v);
                break;
        }
out:
        ++*pos;
        st->last_pos = *pos;
        return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
        struct tcp_iter_state *st = seq->private;

        switch (st->state) {
        case TCP_SEQ_STATE_LISTENING:
                if (v != SEQ_START_TOKEN)
                        spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
                break;
        case TCP_SEQ_STATE_ESTABLISHED:
                if (v)
                        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
                break;
        }
}

int tcp_seq_open(struct inode *inode, struct file *file)
{
        struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
        struct tcp_iter_state *s;
        int err;

        err = seq_open_net(inode, file, &afinfo->seq_ops,
                          sizeof(struct tcp_iter_state));
        if (err < 0)
                return err;

        s = ((struct seq_file *)file->private_data)->private;
        s->family               = afinfo->family;
        s->last_pos             = 0;
        return 0;
}
EXPORT_SYMBOL(tcp_seq_open);

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
        int rc = 0;
        struct proc_dir_entry *p;

        afinfo->seq_ops.start           = tcp_seq_start;
        afinfo->seq_ops.next            = tcp_seq_next;
        afinfo->seq_ops.stop            = tcp_seq_stop;

        p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
                             afinfo->seq_fops, afinfo);
        if (!p)
                rc = -ENOMEM;
        return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
        remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);

static void get_openreq4(const struct request_sock *req,
                         struct seq_file *f, int i)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        long delta = req->rsk_timer.expires - jiffies;

        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
                " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
                i,
                ireq->ir_loc_addr,
                ireq->ir_num,
                ireq->ir_rmt_addr,
                ntohs(ireq->ir_rmt_port),
                TCP_SYN_RECV,
                0, 0, /* could print option size, but that is af dependent. */
                1,    /* timers active (only the expire timer) */
                jiffies_delta_to_clock_t(delta),
                req->num_timeout,
                from_kuid_munged(seq_user_ns(f),
                                 sock_i_uid(req->rsk_listener)),
                0,  /* non standard timer */
                0, /* open_requests have no inode */
                0,
                req);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
        int timer_active;
        unsigned long timer_expires;
        const struct tcp_sock *tp = tcp_sk(sk);
        const struct inet_connection_sock *icsk = inet_csk(sk);
        const struct inet_sock *inet = inet_sk(sk);
        const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
        __be32 dest = inet->inet_daddr;
        __be32 src = inet->inet_rcv_saddr;
        __u16 destp = ntohs(inet->inet_dport);
        __u16 srcp = ntohs(inet->inet_sport);
        int rx_queue;
        int state;

        if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
            icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
                timer_active    = 1;
                timer_expires   = icsk->icsk_timeout;
        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
                timer_active    = 4;
                timer_expires   = icsk->icsk_timeout;
        } else if (timer_pending(&sk->sk_timer)) {
                timer_active    = 2;
                timer_expires   = sk->sk_timer.expires;
        } else {
                timer_active    = 0;
                timer_expires = jiffies;
        }

        state = sk_state_load(sk);
        if (state == TCP_LISTEN)
                rx_queue = sk->sk_ack_backlog;
        else
                /* Because we don't lock the socket,
                 * we might find a transient negative value.
                 */
                rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
                        "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
                i, src, srcp, dest, destp, state,
                tp->write_seq - tp->snd_una,
                rx_queue,
                timer_active,
                jiffies_delta_to_clock_t(timer_expires - jiffies),
                icsk->icsk_retransmits,
                from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
                icsk->icsk_probes_out,
                sock_i_ino(sk),
                atomic_read(&sk->sk_refcnt), sk,
                jiffies_to_clock_t(icsk->icsk_rto),
                jiffies_to_clock_t(icsk->icsk_ack.ato),
                (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
                tp->snd_cwnd,
                state == TCP_LISTEN ?
                    fastopenq->max_qlen :
                    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

static void get_timewait4_sock(const struct inet_timewait_sock *tw,
                               struct seq_file *f, int i)
{
        long delta = tw->tw_timer.expires - jiffies;
        __be32 dest, src;
        __u16 destp, srcp;

        dest  = tw->tw_daddr;
        src   = tw->tw_rcv_saddr;
        destp = ntohs(tw->tw_dport);
        srcp  = ntohs(tw->tw_sport);

        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
                3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
                atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
        struct tcp_iter_state *st;
        struct sock *sk = v;

        seq_setwidth(seq, TMPSZ - 1);
        if (v == SEQ_START_TOKEN) {
                seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
                           "rx_queue tr tm->when retrnsmt   uid  timeout "
                           "inode");
                goto out;
        }
        st = seq->private;

        if (sk->sk_state == TCP_TIME_WAIT)
                get_timewait4_sock(v, seq, st->num);
        else if (sk->sk_state == TCP_NEW_SYN_RECV)
                get_openreq4(v, seq, st->num);
        else
                get_tcp4_sock(v, seq, st->num);
out:
        seq_pad(seq, '\n');
        return 0;
}
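
/* Addresses and ports in /proc/net/tcp are printed as raw hex: ports in
 * host order, addresses in network byte order (so little-endian on x86).
 * For example, a socket listening on 127.0.0.1:8080 would show up roughly
 * as (values illustrative, not taken from a real dump):
 *
 *	0: 0100007F:1F90 00000000:0000 0A 00000000:00000000 00:00000000 ...
 *
 * where 0A is TCP_LISTEN.
 */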

static const struct file_operations tcp_afinfo_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = tcp_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
        .name           = "tcp",
        .family         = AF_INET,
        .seq_fops       = &tcp_afinfo_seq_fops,
        .seq_ops        = {
                .show           = tcp4_seq_show,
        },
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
        return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
        tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
        .init = tcp4_proc_init_net,
        .exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
        return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
        unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

struct proto tcp_prot = {
        .name                   = "TCP",
        .owner                  = THIS_MODULE,
        .close                  = tcp_close,
        .connect                = tcp_v4_connect,
        .disconnect             = tcp_disconnect,
        .accept                 = inet_csk_accept,
        .ioctl                  = tcp_ioctl,
        .init                   = tcp_v4_init_sock,
        .destroy                = tcp_v4_destroy_sock,
        .shutdown               = tcp_shutdown,
        .setsockopt             = tcp_setsockopt,
        .getsockopt             = tcp_getsockopt,
        .recvmsg                = tcp_recvmsg,
        .sendmsg                = tcp_sendmsg,
        .sendpage               = tcp_sendpage,
        .backlog_rcv            = tcp_v4_do_rcv,
        .release_cb             = tcp_release_cb,
        .hash                   = inet_hash,
        .unhash                 = inet_unhash,
        .get_port               = inet_csk_get_port,
        .enter_memory_pressure  = tcp_enter_memory_pressure,
        .stream_memory_free     = tcp_stream_memory_free,
        .sockets_allocated      = &tcp_sockets_allocated,
        .orphan_count           = &tcp_orphan_count,
        .memory_allocated       = &tcp_memory_allocated,
        .memory_pressure        = &tcp_memory_pressure,
        .sysctl_mem             = sysctl_tcp_mem,
        .sysctl_wmem            = sysctl_tcp_wmem,
        .sysctl_rmem            = sysctl_tcp_rmem,
        .max_header             = MAX_TCP_HEADER,
        .obj_size               = sizeof(struct tcp_sock),
        .slab_flags             = SLAB_DESTROY_BY_RCU,
        .twsk_prot              = &tcp_timewait_sock_ops,
        .rsk_prot               = &tcp_request_sock_ops,
        .h.hashinfo             = &tcp_hashinfo,
        .no_autobind            = true,
#ifdef CONFIG_COMPAT
        .compat_setsockopt      = compat_tcp_setsockopt,
        .compat_getsockopt      = compat_tcp_getsockopt,
#endif
        .diag_destroy           = tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
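
/* tcp_prot is what a socket(AF_INET, SOCK_STREAM, IPPROTO_TCP) call ends
 * up bound to: e.g., connect(2) reaches tcp_v4_connect() and accept(2)
 * reaches inet_csk_accept() through these ops.
 */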

static void __net_exit tcp_sk_exit(struct net *net)
{
        int cpu;

        for_each_possible_cpu(cpu)
                inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
        free_percpu(net->ipv4.tcp_sk);
}

static int __net_init tcp_sk_init(struct net *net)
{
        int res, cpu;

        net->ipv4.tcp_sk = alloc_percpu(struct sock *);
        if (!net->ipv4.tcp_sk)
                return -ENOMEM;

        for_each_possible_cpu(cpu) {
                struct sock *sk;

                res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
                                           IPPROTO_TCP, net);
                if (res)
                        goto fail;
                sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
                *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
        }

        net->ipv4.sysctl_tcp_ecn = 2;
        net->ipv4.sysctl_tcp_ecn_fallback = 1;

        net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
        net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
        net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

        net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
        net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
        net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

        net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
        net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
        net->ipv4.sysctl_tcp_syncookies = 1;
        net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
        net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
        net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
        net->ipv4.sysctl_tcp_orphan_retries = 0;
        net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
        net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;

        return 0;
fail:
        tcp_sk_exit(net);

        return res;
}
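
/* These assignments are the per-network-namespace defaults behind the
 * /proc/sys/net/ipv4/tcp_* files: a freshly created netns starts with
 * them (e.g. syncookies enabled, ECN in mode 2, i.e. respond to but do
 * not request ECN) and can override each one independently via sysctl.
 */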

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
        inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init       = tcp_sk_init,
       .exit       = tcp_sk_exit,
       .exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
        inet_hashinfo_init(&tcp_hashinfo);
        if (register_pernet_subsys(&tcp_sk_ops))
                panic("Failed to create the TCP control socket.\n");
}