net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53 #define pr_fmt(fmt) "TCP: " fmt
  54
  55 #include <linux/bottom_half.h>
  56 #include <linux/types.h>
  57 #include <linux/fcntl.h>
  58 #include <linux/module.h>
  59 #include <linux/random.h>
  60 #include <linux/cache.h>
  61 #include <linux/jhash.h>
  62 #include <linux/init.h>
  63 #include <linux/times.h>
  64 #include <linux/slab.h>
  65
  66 #include <net/net_namespace.h>
  67 #include <net/icmp.h>
  68 #include <net/inet_hashtables.h>
  69 #include <net/tcp.h>
  70 #include <net/transp_v6.h>
  71 #include <net/ipv6.h>
  72 #include <net/inet_common.h>
  73 #include <net/timewait_sock.h>
  74 #include <net/xfrm.h>
  75 #include <net/secure_seq.h>
  76 #include <net/busy_poll.h>
  77
  78 #include <linux/inet.h>
  79 #include <linux/ipv6.h>
  80 #include <linux/stddef.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/seq_file.h>
  83
  84 #include <crypto/hash.h>
  85 #include <linux/scatterlist.h>
  86
/* Consulted by tcp_twsk_unique(): when the caller passes a non-NULL twp,
 * a TIME-WAIT socket is only handed out for reuse if this is non-zero.
 */
int sysctl_tcp_tw_reuse __read_mostly;
/* Exported to modules; not referenced by the code visible in this part of
 * the file.
 */
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);

#ifdef CONFIG_TCP_MD5SIG
/* Forward declaration: the RST/ACK reply builders below call this to sign
 * the header they construct.
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

/* Hash tables handed to the established/listener socket lookups in this
 * file.
 */
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
  98
  99 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
 100 {
 101         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 102                                           ip_hdr(skb)->saddr,
 103                                           tcp_hdr(skb)->dest,
 104                                           tcp_hdr(skb)->source);
 105 }
 106
 107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 108 {
 109         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 110         struct tcp_sock *tp = tcp_sk(sk);
 111
 112         /* With PAWS, it is safe from the viewpoint
 113            of data integrity. Even without PAWS it is safe provided sequence
 114            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 115
 116            Actually, the idea is close to VJ's one, only timestamp cache is
 117            held not per host, but per port pair and TW bucket is used as state
 118            holder.
 119
 120            If TW bucket has been already destroyed we fall back to VJ's scheme
 121            and use initial timestamp retrieved from peer table.
 122          */
 123         if (tcptw->tw_ts_recent_stamp &&
 124             (!twp || (sysctl_tcp_tw_reuse &&
 125                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 126                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 127                 if (tp->write_seq == 0)
 128                         tp->write_seq = 1;
 129                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 130                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 131                 sock_hold(sktw);
 132                 return 1;
 133         }
 134
 135         return 0;
 136 }
 137 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 138
 139 /* This will initiate an outgoing connection. */
 140 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 141 {
 142         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 143         struct inet_sock *inet = inet_sk(sk);
 144         struct tcp_sock *tp = tcp_sk(sk);
 145         __be16 orig_sport, orig_dport;
 146         __be32 daddr, nexthop;
 147         struct flowi4 *fl4;
 148         struct rtable *rt;
 149         int err;
 150         struct ip_options_rcu *inet_opt;
 151
 152         if (addr_len < sizeof(struct sockaddr_in))
 153                 return -EINVAL;
 154
 155         if (usin->sin_family != AF_INET)
 156                 return -EAFNOSUPPORT;
 157
 158         nexthop = daddr = usin->sin_addr.s_addr;
 159         inet_opt = rcu_dereference_protected(inet->inet_opt,
 160                                              lockdep_sock_is_held(sk));
 161         if (inet_opt && inet_opt->opt.srr) {
 162                 if (!daddr)
 163                         return -EINVAL;
 164                 nexthop = inet_opt->opt.faddr;
 165         }
 166
 167         orig_sport = inet->inet_sport;
 168         orig_dport = usin->sin_port;
 169         fl4 = &inet->cork.fl.u.ip4;
 170         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 171                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 172                               IPPROTO_TCP,
 173                               orig_sport, orig_dport, sk);
 174         if (IS_ERR(rt)) {
 175                 err = PTR_ERR(rt);
 176                 if (err == -ENETUNREACH)
 177                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 178                 return err;
 179         }
 180
 181         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 182                 ip_rt_put(rt);
 183                 return -ENETUNREACH;
 184         }
 185
 186         if (!inet_opt || !inet_opt->opt.srr)
 187                 daddr = fl4->daddr;
 188
 189         if (!inet->inet_saddr)
 190                 inet->inet_saddr = fl4->saddr;
 191         sk_rcv_saddr_set(sk, inet->inet_saddr);
 192
 193         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 194                 /* Reset inherited state */
 195                 tp->rx_opt.ts_recent       = 0;
 196                 tp->rx_opt.ts_recent_stamp = 0;
 197                 if (likely(!tp->repair))
 198                         tp->write_seq      = 0;
 199         }
 200
 201         if (tcp_death_row.sysctl_tw_recycle &&
 202             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
 203                 tcp_fetch_timewait_stamp(sk, &rt->dst);
 204
 205         inet->inet_dport = usin->sin_port;
 206         sk_daddr_set(sk, daddr);
 207
 208         inet_csk(sk)->icsk_ext_hdr_len = 0;
 209         if (inet_opt)
 210                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 211
 212         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 213
 214         /* Socket identity is still unknown (sport may be zero).
 215          * However we set state to SYN-SENT and not releasing socket
 216          * lock select source port, enter ourselves into the hash tables and
 217          * complete initialization after this.
 218          */
 219         tcp_set_state(sk, TCP_SYN_SENT);
 220         err = inet_hash_connect(&tcp_death_row, sk);
 221         if (err)
 222                 goto failure;
 223
 224         sk_set_txhash(sk);
 225
 226         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 227                                inet->inet_sport, inet->inet_dport, sk);
 228         if (IS_ERR(rt)) {
 229                 err = PTR_ERR(rt);
 230                 rt = NULL;
 231                 goto failure;
 232         }
 233         /* OK, now commit destination to socket.  */
 234         sk->sk_gso_type = SKB_GSO_TCPV4;
 235         sk_setup_caps(sk, &rt->dst);
 236
 237         if (!tp->write_seq && likely(!tp->repair))
 238                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 239                                                            inet->inet_daddr,
 240                                                            inet->inet_sport,
 241                                                            usin->sin_port);
 242
 243         inet->inet_id = tp->write_seq ^ jiffies;
 244
 245         err = tcp_connect(sk);
 246
 247         rt = NULL;
 248         if (err)
 249                 goto failure;
 250
 251         return 0;
 252
 253 failure:
 254         /*
 255          * This unhashes the socket and releases the local port,
 256          * if necessary.
 257          */
 258         tcp_set_state(sk, TCP_CLOSE);
 259         ip_rt_put(rt);
 260         sk->sk_route_caps = 0;
 261         inet->inet_dport = 0;
 262         return err;
 263 }
 264 EXPORT_SYMBOL(tcp_v4_connect);
 265
 266 /*
 267  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 268  * It can be called through tcp_release_cb() if socket was owned by user
 269  * at the time tcp_v4_err() was called to handle ICMP message.
 270  */
 271 void tcp_v4_mtu_reduced(struct sock *sk)
 272 {
 273         struct dst_entry *dst;
 274         struct inet_sock *inet = inet_sk(sk);
 275         u32 mtu = tcp_sk(sk)->mtu_info;
 276
 277         dst = inet_csk_update_pmtu(sk, mtu);
 278         if (!dst)
 279                 return;
 280
 281         /* Something is about to be wrong... Remember soft error
 282          * for the case, if this connection will not able to recover.
 283          */
 284         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 285                 sk->sk_err_soft = EMSGSIZE;
 286
 287         mtu = dst_mtu(dst);
 288
 289         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 290             ip_sk_accept_pmtu(sk) &&
 291             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 292                 tcp_sync_mss(sk, mtu);
 293
 294                 /* Resend the TCP packet because it's
 295                  * clear that the old packet has been
 296                  * dropped. This is the new "fast" path mtu
 297                  * discovery.
 298                  */
 299                 tcp_simple_retransmit(sk);
 300         } /* else let the usual retransmit timer handle it */
 301 }
 302 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 303
 304 static void do_redirect(struct sk_buff *skb, struct sock *sk)
 305 {
 306         struct dst_entry *dst = __sk_dst_check(sk, 0);
 307
 308         if (dst)
 309                 dst->ops->redirect(dst, sk, skb);
 310 }
 311
 312
 313 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 314 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 315 {
 316         struct request_sock *req = inet_reqsk(sk);
 317         struct net *net = sock_net(sk);
 318
 319         /* ICMPs are not backlogged, hence we cannot get
 320          * an established socket here.
 321          */
 322         if (seq != tcp_rsk(req)->snt_isn) {
 323                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 324         } else if (abort) {
 325                 /*
 326                  * Still in SYN_RECV, just remove it silently.
 327                  * There is no good way to pass the error to the newly
 328                  * created socket, and POSIX does not want network
 329                  * errors returned from accept().
 330                  */
 331                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 332                 tcp_listendrop(req->rsk_listener);
 333         }
 334         reqsk_put(req);
 335 }
 336 EXPORT_SYMBOL(tcp_req_err);
 337
 338 /*
 339  * This routine is called by the ICMP module when it gets some
 340  * sort of error condition.  If err < 0 then the socket should
 341  * be closed and the error returned to the user.  If err > 0
 342  * it's just the icmp type << 8 | icmp code.  After adjustment
 343  * header points to the first 8 bytes of the tcp header.  We need
 344  * to find the appropriate port.
 345  *
 346  * The locking strategy used here is very "optimistic". When
 347  * someone else accesses the socket the ICMP is just dropped
 348  * and for some paths there is no check at all.
 349  * A more general error queue to queue errors for later handling
 350  * is probably better.
 351  *
 352  */
 353
 354 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 355 {
 356         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 357         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 358         struct inet_connection_sock *icsk;
 359         struct tcp_sock *tp;
 360         struct inet_sock *inet;
 361         const int type = icmp_hdr(icmp_skb)->type;
 362         const int code = icmp_hdr(icmp_skb)->code;
 363         struct sock *sk;
 364         struct sk_buff *skb;
 365         struct request_sock *fastopen;
 366         __u32 seq, snd_una;
 367         __u32 remaining;
 368         int err;
 369         struct net *net = dev_net(icmp_skb->dev);
 370
 371         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 372                                        th->dest, iph->saddr, ntohs(th->source),
 373                                        inet_iif(icmp_skb));
 374         if (!sk) {
 375                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 376                 return;
 377         }
 378         if (sk->sk_state == TCP_TIME_WAIT) {
 379                 inet_twsk_put(inet_twsk(sk));
 380                 return;
 381         }
 382         seq = ntohl(th->seq);
 383         if (sk->sk_state == TCP_NEW_SYN_RECV)
 384                 return tcp_req_err(sk, seq,
 385                                   type == ICMP_PARAMETERPROB ||
 386                                   type == ICMP_TIME_EXCEEDED ||
 387                                   (type == ICMP_DEST_UNREACH &&
 388                                    (code == ICMP_NET_UNREACH ||
 389                                     code == ICMP_HOST_UNREACH)));
 390
 391         bh_lock_sock(sk);
 392         /* If too many ICMPs get dropped on busy
 393          * servers this needs to be solved differently.
 394          * We do take care of PMTU discovery (RFC1191) special case :
 395          * we can receive locally generated ICMP messages while socket is held.
 396          */
 397         if (sock_owned_by_user(sk)) {
 398                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 399                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 400         }
 401         if (sk->sk_state == TCP_CLOSE)
 402                 goto out;
 403
 404         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 405                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 406                 goto out;
 407         }
 408
 409         icsk = inet_csk(sk);
 410         tp = tcp_sk(sk);
 411         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 412         fastopen = tp->fastopen_rsk;
 413         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 414         if (sk->sk_state != TCP_LISTEN &&
 415             !between(seq, snd_una, tp->snd_nxt)) {
 416                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 417                 goto out;
 418         }
 419
 420         switch (type) {
 421         case ICMP_REDIRECT:
 422                 do_redirect(icmp_skb, sk);
 423                 goto out;
 424         case ICMP_SOURCE_QUENCH:
 425                 /* Just silently ignore these. */
 426                 goto out;
 427         case ICMP_PARAMETERPROB:
 428                 err = EPROTO;
 429                 break;
 430         case ICMP_DEST_UNREACH:
 431                 if (code > NR_ICMP_UNREACH)
 432                         goto out;
 433
 434                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 435                         /* We are not interested in TCP_LISTEN and open_requests
 436                          * (SYN-ACKs send out by Linux are always <576bytes so
 437                          * they should go through unfragmented).
 438                          */
 439                         if (sk->sk_state == TCP_LISTEN)
 440                                 goto out;
 441
 442                         tp->mtu_info = info;
 443                         if (!sock_owned_by_user(sk)) {
 444                                 tcp_v4_mtu_reduced(sk);
 445                         } else {
 446                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
 447                                         sock_hold(sk);
 448                         }
 449                         goto out;
 450                 }
 451
 452                 err = icmp_err_convert[code].errno;
 453                 /* check if icmp_skb allows revert of backoff
 454                  * (see draft-zimmermann-tcp-lcd) */
 455                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 456                         break;
 457                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 458                     !icsk->icsk_backoff || fastopen)
 459                         break;
 460
 461                 if (sock_owned_by_user(sk))
 462                         break;
 463
 464                 icsk->icsk_backoff--;
 465                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
 466                                                TCP_TIMEOUT_INIT;
 467                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 468
 469                 skb = tcp_write_queue_head(sk);
 470                 BUG_ON(!skb);
 471
 472                 remaining = icsk->icsk_rto -
 473                             min(icsk->icsk_rto,
 474                                 tcp_time_stamp - tcp_skb_timestamp(skb));
 475
 476                 if (remaining) {
 477                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 478                                                   remaining, TCP_RTO_MAX);
 479                 } else {
 480                         /* RTO revert clocked out retransmission.
 481                          * Will retransmit now */
 482                         tcp_retransmit_timer(sk);
 483                 }
 484
 485                 break;
 486         case ICMP_TIME_EXCEEDED:
 487                 err = EHOSTUNREACH;
 488                 break;
 489         default:
 490                 goto out;
 491         }
 492
 493         switch (sk->sk_state) {
 494         case TCP_SYN_SENT:
 495         case TCP_SYN_RECV:
 496                 /* Only in fast or simultaneous open. If a fast open socket is
 497                  * is already accepted it is treated as a connected one below.
 498                  */
 499                 if (fastopen && !fastopen->sk)
 500                         break;
 501
 502                 if (!sock_owned_by_user(sk)) {
 503                         sk->sk_err = err;
 504
 505                         sk->sk_error_report(sk);
 506
 507                         tcp_done(sk);
 508                 } else {
 509                         sk->sk_err_soft = err;
 510                 }
 511                 goto out;
 512         }
 513
 514         /* If we've already connected we will keep trying
 515          * until we time out, or the user gives up.
 516          *
 517          * rfc1122 4.2.3.9 allows to consider as hard errors
 518          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 519          * but it is obsoleted by pmtu discovery).
 520          *
 521          * Note, that in modern internet, where routing is unreliable
 522          * and in each dark corner broken firewalls sit, sending random
 523          * errors ordered by their masters even this two messages finally lose
 524          * their original sense (even Linux sends invalid PORT_UNREACHs)
 525          *
 526          * Now we are in compliance with RFCs.
 527          *                                                      --ANK (980905)
 528          */
 529
 530         inet = inet_sk(sk);
 531         if (!sock_owned_by_user(sk) && inet->recverr) {
 532                 sk->sk_err = err;
 533                 sk->sk_error_report(sk);
 534         } else  { /* Only an error on timeout */
 535                 sk->sk_err_soft = err;
 536         }
 537
 538 out:
 539         bh_unlock_sock(sk);
 540         sock_put(sk);
 541 }
 542
 543 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 544 {
 545         struct tcphdr *th = tcp_hdr(skb);
 546
 547         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 548                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 549                 skb->csum_start = skb_transport_header(skb) - skb->head;
 550                 skb->csum_offset = offsetof(struct tcphdr, check);
 551         } else {
 552                 th->check = tcp_v4_check(skb->len, saddr, daddr,
 553                                          csum_partial(th,
 554                                                       th->doff << 2,
 555                                                       skb->csum));
 556         }
 557 }
 558
 559 /* This routine computes an IPv4 TCP checksum. */
 560 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 561 {
 562         const struct inet_sock *inet = inet_sk(sk);
 563
 564         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 565 }
 566 EXPORT_SYMBOL(tcp_v4_send_check);
 567
 568 /*
 569  *      This routine will send an RST to the other tcp.
 570  *
 571  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 572  *                    for reset.
 573  *      Answer: if a packet caused RST, it is not for a socket
 574  *              existing in our system, if it is matched to a socket,
 575  *              it is just duplicate segment or bug in other side's TCP.
 576  *              So that we build reply only basing on parameters
 577  *              arrived with segment.
 578  *      Exception: precedence violation. We do not implement it in any case.
 579  */
 580
 581 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 582 {
 583         const struct tcphdr *th = tcp_hdr(skb);
 584         struct {
 585                 struct tcphdr th;
 586 #ifdef CONFIG_TCP_MD5SIG
 587                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 588 #endif
 589         } rep;
 590         struct ip_reply_arg arg;
 591 #ifdef CONFIG_TCP_MD5SIG
 592         struct tcp_md5sig_key *key = NULL;
 593         const __u8 *hash_location = NULL;
 594         unsigned char newhash[16];
 595         int genhash;
 596         struct sock *sk1 = NULL;
 597 #endif
 598         struct net *net;
 599
 600         /* Never send a reset in response to a reset. */
 601         if (th->rst)
 602                 return;
 603
 604         /* If sk not NULL, it means we did a successful lookup and incoming
 605          * route had to be correct. prequeue might have dropped our dst.
 606          */
 607         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 608                 return;
 609
 610         /* Swap the send and the receive. */
 611         memset(&rep, 0, sizeof(rep));
 612         rep.th.dest   = th->source;
 613         rep.th.source = th->dest;
 614         rep.th.doff   = sizeof(struct tcphdr) / 4;
 615         rep.th.rst    = 1;
 616
 617         if (th->ack) {
 618                 rep.th.seq = th->ack_seq;
 619         } else {
 620                 rep.th.ack = 1;
 621                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 622                                        skb->len - (th->doff << 2));
 623         }
 624
 625         memset(&arg, 0, sizeof(arg));
 626         arg.iov[0].iov_base = (unsigned char *)&rep;
 627         arg.iov[0].iov_len  = sizeof(rep.th);
 628
 629         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 630 #ifdef CONFIG_TCP_MD5SIG
 631         rcu_read_lock();
 632         hash_location = tcp_parse_md5sig_option(th);
 633         if (sk && sk_fullsock(sk)) {
 634                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 635                                         &ip_hdr(skb)->saddr, AF_INET);
 636         } else if (hash_location) {
 637                 /*
 638                  * active side is lost. Try to find listening socket through
 639                  * source port, and then find md5 key through listening socket.
 640                  * we are not loose security here:
 641                  * Incoming packet is checked with md5 hash with finding key,
 642                  * no RST generated if md5 hash doesn't match.
 643                  */
 644                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
 645                                              ip_hdr(skb)->saddr,
 646                                              th->source, ip_hdr(skb)->daddr,
 647                                              ntohs(th->source), inet_iif(skb));
 648                 /* don't send rst if it can't find key */
 649                 if (!sk1)
 650                         goto out;
 651
 652                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 653                                         &ip_hdr(skb)->saddr, AF_INET);
 654                 if (!key)
 655                         goto out;
 656
 657
 658                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 659                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
 660                         goto out;
 661
 662         }
 663
 664         if (key) {
 665                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 666                                    (TCPOPT_NOP << 16) |
 667                                    (TCPOPT_MD5SIG << 8) |
 668                                    TCPOLEN_MD5SIG);
 669                 /* Update length and the length the header thinks exists */
 670                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 671                 rep.th.doff = arg.iov[0].iov_len / 4;
 672
 673                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 674                                      key, ip_hdr(skb)->saddr,
 675                                      ip_hdr(skb)->daddr, &rep.th);
 676         }
 677 #endif
 678         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 679                                       ip_hdr(skb)->saddr, /* XXX */
 680                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 681         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 682         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 683
 684         /* When socket is gone, all binding information is lost.
 685          * routing might fail in this case. No choice here, if we choose to force
 686          * input interface, we will misroute in case of asymmetric route.
 687          */
 688         if (sk)
 689                 arg.bound_dev_if = sk->sk_bound_dev_if;
 690
 691         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 692                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 693
 694         arg.tos = ip_hdr(skb)->tos;
 695         local_bh_disable();
 696         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
 697                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 698                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 699                               &arg, arg.iov[0].iov_len);
 700
 701         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 702         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 703         local_bh_enable();
 704
 705 #ifdef CONFIG_TCP_MD5SIG
 706 out:
 707         rcu_read_unlock();
 708 #endif
 709 }
 710
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside of any socket context, is certainly ugly. What can I do?
 */
 714
 715 static void tcp_v4_send_ack(struct net *net,
 716                             struct sk_buff *skb, u32 seq, u32 ack,
 717                             u32 win, u32 tsval, u32 tsecr, int oif,
 718                             struct tcp_md5sig_key *key,
 719                             int reply_flags, u8 tos)
 720 {
 721         const struct tcphdr *th = tcp_hdr(skb);
 722         struct {
 723                 struct tcphdr th;
 724                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 725 #ifdef CONFIG_TCP_MD5SIG
 726                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 727 #endif
 728                         ];
 729         } rep;
 730         struct ip_reply_arg arg;
 731
 732         memset(&rep.th, 0, sizeof(struct tcphdr));
 733         memset(&arg, 0, sizeof(arg));
 734
 735         arg.iov[0].iov_base = (unsigned char *)&rep;
 736         arg.iov[0].iov_len  = sizeof(rep.th);
 737         if (tsecr) {
 738                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 739                                    (TCPOPT_TIMESTAMP << 8) |
 740                                    TCPOLEN_TIMESTAMP);
 741                 rep.opt[1] = htonl(tsval);
 742                 rep.opt[2] = htonl(tsecr);
 743                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 744         }
 745
 746         /* Swap the send and the receive. */
 747         rep.th.dest    = th->source;
 748         rep.th.source  = th->dest;
 749         rep.th.doff    = arg.iov[0].iov_len / 4;
 750         rep.th.seq     = htonl(seq);
 751         rep.th.ack_seq = htonl(ack);
 752         rep.th.ack     = 1;
 753         rep.th.window  = htons(win);
 754
 755 #ifdef CONFIG_TCP_MD5SIG
 756         if (key) {
 757                 int offset = (tsecr) ? 3 : 0;
 758
 759                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 760                                           (TCPOPT_NOP << 16) |
 761                                           (TCPOPT_MD5SIG << 8) |
 762                                           TCPOLEN_MD5SIG);
 763                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 764                 rep.th.doff = arg.iov[0].iov_len/4;
 765
 766                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 767                                     key, ip_hdr(skb)->saddr,
 768                                     ip_hdr(skb)->daddr, &rep.th);
 769         }
 770 #endif
 771         arg.flags = reply_flags;
 772         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 773                                       ip_hdr(skb)->saddr, /* XXX */
 774                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 775         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 776         if (oif)
 777                 arg.bound_dev_if = oif;
 778         arg.tos = tos;
 779         local_bh_disable();
 780         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
 781                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 782                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 783                               &arg, arg.iov[0].iov_len);
 784
 785         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 786         local_bh_enable();
 787 }
 788
 789 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 790 {
 791         struct inet_timewait_sock *tw = inet_twsk(sk);
 792         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 793
 794         tcp_v4_send_ack(sock_net(sk), skb,
 795                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 796                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 797                         tcp_time_stamp + tcptw->tw_ts_offset,
 798                         tcptw->tw_ts_recent,
 799                         tw->tw_bound_dev_if,
 800                         tcp_twsk_md5_key(tcptw),
 801                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 802                         tw->tw_tos
 803                         );
 804
 805         inet_twsk_put(tw);
 806 }
 807
 808 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 809                                   struct request_sock *req)
 810 {
 811         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 812          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 813          */
 814         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 815                                              tcp_sk(sk)->snd_nxt;
 816
 817         /* RFC 7323 2.3
 818          * The window field (SEG.WND) of every outgoing segment, with the
 819          * exception of <SYN> segments, MUST be right-shifted by
 820          * Rcv.Wind.Shift bits:
 821          */
 822         tcp_v4_send_ack(sock_net(sk), skb, seq,
 823                         tcp_rsk(req)->rcv_nxt,
 824                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 825                         tcp_time_stamp,
 826                         req->ts_recent,
 827                         0,
 828                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
 829                                           AF_INET),
 830                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 831                         ip_hdr(skb)->tos);
 832 }
 833
 834 /*
 835  *      Send a SYN-ACK after having received a SYN.
 836  *      This still operates on a request_sock only, not on a big
 837  *      socket.
 838  */
 839 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 840                               struct flowi *fl,
 841                               struct request_sock *req,
 842                               struct tcp_fastopen_cookie *foc,
 843                               enum tcp_synack_type synack_type)
 844 {
 845         const struct inet_request_sock *ireq = inet_rsk(req);
 846         struct flowi4 fl4;
 847         int err = -1;
 848         struct sk_buff *skb;
 849
 850         /* First, grab a route. */
 851         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 852                 return -1;
 853
 854         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
 855
 856         if (skb) {
 857                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 858
 859                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 860                                             ireq->ir_rmt_addr,
 861                                             ireq->opt);
 862                 err = net_xmit_eval(err);
 863         }
 864
 865         return err;
 866 }
 867
 868 /*
 869  *      IPv4 request_sock destructor.
 870  */
 871 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 872 {
 873         kfree(inet_rsk(req)->opt);
 874 }
 875
 876 #ifdef CONFIG_TCP_MD5SIG
 877 /*
 878  * RFC2385 MD5 checksumming requires a mapping of
 879  * IP address->MD5 Key.
 880  * We need to maintain these in the sk structure.
 881  */
 882
 883 /* Find the Key structure for an address.  */
 884 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
 885                                          const union tcp_md5_addr *addr,
 886                                          int family)
 887 {
 888         const struct tcp_sock *tp = tcp_sk(sk);
 889         struct tcp_md5sig_key *key;
 890         unsigned int size = sizeof(struct in_addr);
 891         const struct tcp_md5sig_info *md5sig;
 892
 893         /* caller either holds rcu_read_lock() or socket lock */
 894         md5sig = rcu_dereference_check(tp->md5sig_info,
 895                                        lockdep_sock_is_held(sk));
 896         if (!md5sig)
 897                 return NULL;
 898 #if IS_ENABLED(CONFIG_IPV6)
 899         if (family == AF_INET6)
 900                 size = sizeof(struct in6_addr);
 901 #endif
 902         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 903                 if (key->family != family)
 904                         continue;
 905                 if (!memcmp(&key->addr, addr, size))
 906                         return key;
 907         }
 908         return NULL;
 909 }
 910 EXPORT_SYMBOL(tcp_md5_do_lookup);
 911
 912 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
 913                                          const struct sock *addr_sk)
 914 {
 915         const union tcp_md5_addr *addr;
 916
 917         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
 918         return tcp_md5_do_lookup(sk, addr, AF_INET);
 919 }
 920 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 921
 922 /* This can be called on a newly created socket, from other files */
 923 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 924                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
 925 {
 926         /* Add Key to the list */
 927         struct tcp_md5sig_key *key;
 928         struct tcp_sock *tp = tcp_sk(sk);
 929         struct tcp_md5sig_info *md5sig;
 930
 931         key = tcp_md5_do_lookup(sk, addr, family);
 932         if (key) {
 933                 /* Pre-existing entry - just update that one. */
 934                 memcpy(key->key, newkey, newkeylen);
 935                 key->keylen = newkeylen;
 936                 return 0;
 937         }
 938
 939         md5sig = rcu_dereference_protected(tp->md5sig_info,
 940                                            lockdep_sock_is_held(sk));
 941         if (!md5sig) {
 942                 md5sig = kmalloc(sizeof(*md5sig), gfp);
 943                 if (!md5sig)
 944                         return -ENOMEM;
 945
 946                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 947                 INIT_HLIST_HEAD(&md5sig->head);
 948                 rcu_assign_pointer(tp->md5sig_info, md5sig);
 949         }
 950
 951         key = sock_kmalloc(sk, sizeof(*key), gfp);
 952         if (!key)
 953                 return -ENOMEM;
 954         if (!tcp_alloc_md5sig_pool()) {
 955                 sock_kfree_s(sk, key, sizeof(*key));
 956                 return -ENOMEM;
 957         }
 958
 959         memcpy(key->key, newkey, newkeylen);
 960         key->keylen = newkeylen;
 961         key->family = family;
 962         memcpy(&key->addr, addr,
 963                (family == AF_INET6) ? sizeof(struct in6_addr) :
 964                                       sizeof(struct in_addr));
 965         hlist_add_head_rcu(&key->node, &md5sig->head);
 966         return 0;
 967 }
 968 EXPORT_SYMBOL(tcp_md5_do_add);
 969
 970 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
 971 {
 972         struct tcp_md5sig_key *key;
 973
 974         key = tcp_md5_do_lookup(sk, addr, family);
 975         if (!key)
 976                 return -ENOENT;
 977         hlist_del_rcu(&key->node);
 978         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
 979         kfree_rcu(key, rcu);
 980         return 0;
 981 }
 982 EXPORT_SYMBOL(tcp_md5_do_del);
 983
 984 static void tcp_clear_md5_list(struct sock *sk)
 985 {
 986         struct tcp_sock *tp = tcp_sk(sk);
 987         struct tcp_md5sig_key *key;
 988         struct hlist_node *n;
 989         struct tcp_md5sig_info *md5sig;
 990
 991         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
 992
 993         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
 994                 hlist_del_rcu(&key->node);
 995                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
 996                 kfree_rcu(key, rcu);
 997         }
 998 }
 999
1000 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1001                                  int optlen)
1002 {
1003         struct tcp_md5sig cmd;
1004         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1005
1006         if (optlen < sizeof(cmd))
1007                 return -EINVAL;
1008
1009         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1010                 return -EFAULT;
1011
1012         if (sin->sin_family != AF_INET)
1013                 return -EINVAL;
1014
1015         if (!cmd.tcpm_keylen)
1016                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1017                                       AF_INET);
1018
1019         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1020                 return -EINVAL;
1021
1022         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1023                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1024                               GFP_KERNEL);
1025 }
1026
1027 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1028                                    __be32 daddr, __be32 saddr,
1029                                    const struct tcphdr *th, int nbytes)
1030 {
1031         struct tcp4_pseudohdr *bp;
1032         struct scatterlist sg;
1033         struct tcphdr *_th;
1034
1035         bp = hp->scratch;
1036         bp->saddr = saddr;
1037         bp->daddr = daddr;
1038         bp->pad = 0;
1039         bp->protocol = IPPROTO_TCP;
1040         bp->len = cpu_to_be16(nbytes);
1041
1042         _th = (struct tcphdr *)(bp + 1);
1043         memcpy(_th, th, sizeof(*th));
1044         _th->check = 0;
1045
1046         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1047         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1048                                 sizeof(*bp) + sizeof(*th));
1049         return crypto_ahash_update(hp->md5_req);
1050 }
1051
1052 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1053                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1054 {
1055         struct tcp_md5sig_pool *hp;
1056         struct ahash_request *req;
1057
1058         hp = tcp_get_md5sig_pool();
1059         if (!hp)
1060                 goto clear_hash_noput;
1061         req = hp->md5_req;
1062
1063         if (crypto_ahash_init(req))
1064                 goto clear_hash;
1065         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1066                 goto clear_hash;
1067         if (tcp_md5_hash_key(hp, key))
1068                 goto clear_hash;
1069         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1070         if (crypto_ahash_final(req))
1071                 goto clear_hash;
1072
1073         tcp_put_md5sig_pool();
1074         return 0;
1075
1076 clear_hash:
1077         tcp_put_md5sig_pool();
1078 clear_hash_noput:
1079         memset(md5_hash, 0, 16);
1080         return 1;
1081 }
1082
1083 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1084                         const struct sock *sk,
1085                         const struct sk_buff *skb)
1086 {
1087         struct tcp_md5sig_pool *hp;
1088         struct ahash_request *req;
1089         const struct tcphdr *th = tcp_hdr(skb);
1090         __be32 saddr, daddr;
1091
1092         if (sk) { /* valid for establish/request sockets */
1093                 saddr = sk->sk_rcv_saddr;
1094                 daddr = sk->sk_daddr;
1095         } else {
1096                 const struct iphdr *iph = ip_hdr(skb);
1097                 saddr = iph->saddr;
1098                 daddr = iph->daddr;
1099         }
1100
1101         hp = tcp_get_md5sig_pool();
1102         if (!hp)
1103                 goto clear_hash_noput;
1104         req = hp->md5_req;
1105
1106         if (crypto_ahash_init(req))
1107                 goto clear_hash;
1108
1109         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1110                 goto clear_hash;
1111         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1112                 goto clear_hash;
1113         if (tcp_md5_hash_key(hp, key))
1114                 goto clear_hash;
1115         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1116         if (crypto_ahash_final(req))
1117                 goto clear_hash;
1118
1119         tcp_put_md5sig_pool();
1120         return 0;
1121
1122 clear_hash:
1123         tcp_put_md5sig_pool();
1124 clear_hash_noput:
1125         memset(md5_hash, 0, 16);
1126         return 1;
1127 }
1128 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1129
1130 #endif
1131
1132 /* Called with rcu_read_lock() */
1133 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1134                                     const struct sk_buff *skb)
1135 {
1136 #ifdef CONFIG_TCP_MD5SIG
1137         /*
1138          * This gets called for each TCP segment that arrives
1139          * so we want to be efficient.
1140          * We have 3 drop cases:
1141          * o No MD5 hash and one expected.
1142          * o MD5 hash and we're not expecting one.
1143          * o MD5 hash and its wrong.
1144          */
1145         const __u8 *hash_location = NULL;
1146         struct tcp_md5sig_key *hash_expected;
1147         const struct iphdr *iph = ip_hdr(skb);
1148         const struct tcphdr *th = tcp_hdr(skb);
1149         int genhash;
1150         unsigned char newhash[16];
1151
1152         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1153                                           AF_INET);
1154         hash_location = tcp_parse_md5sig_option(th);
1155
1156         /* We've parsed the options - do we have a hash? */
1157         if (!hash_expected && !hash_location)
1158                 return false;
1159
1160         if (hash_expected && !hash_location) {
1161                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1162                 return true;
1163         }
1164
1165         if (!hash_expected && hash_location) {
1166                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1167                 return true;
1168         }
1169
1170         /* Okay, so this is hash_expected and hash_location -
1171          * so we need to calculate the checksum.
1172          */
1173         genhash = tcp_v4_md5_hash_skb(newhash,
1174                                       hash_expected,
1175                                       NULL, skb);
1176
1177         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1178                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1179                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1180                                      &iph->saddr, ntohs(th->source),
1181                                      &iph->daddr, ntohs(th->dest),
1182                                      genhash ? " tcp_v4_calc_md5_hash failed"
1183                                      : "");
1184                 return true;
1185         }
1186         return false;
1187 #endif
1188         return false;
1189 }
1190
1191 static void tcp_v4_init_req(struct request_sock *req,
1192                             const struct sock *sk_listener,
1193                             struct sk_buff *skb)
1194 {
1195         struct inet_request_sock *ireq = inet_rsk(req);
1196
1197         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1198         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1199         ireq->opt = tcp_v4_save_options(skb);
1200 }
1201
1202 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1203                                           struct flowi *fl,
1204                                           const struct request_sock *req,
1205                                           bool *strict)
1206 {
1207         struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1208
1209         if (strict) {
1210                 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1211                         *strict = true;
1212                 else
1213                         *strict = false;
1214         }
1215
1216         return dst;
1217 }
1218
1219 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1220         .family         =       PF_INET,
1221         .obj_size       =       sizeof(struct tcp_request_sock),
1222         .rtx_syn_ack    =       tcp_rtx_synack,
1223         .send_ack       =       tcp_v4_reqsk_send_ack,
1224         .destructor     =       tcp_v4_reqsk_destructor,
1225         .send_reset     =       tcp_v4_send_reset,
1226         .syn_ack_timeout =      tcp_syn_ack_timeout,
1227 };
1228
1229 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1230         .mss_clamp      =       TCP_MSS_DEFAULT,
1231 #ifdef CONFIG_TCP_MD5SIG
1232         .req_md5_lookup =       tcp_v4_md5_lookup,
1233         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1234 #endif
1235         .init_req       =       tcp_v4_init_req,
1236 #ifdef CONFIG_SYN_COOKIES
1237         .cookie_init_seq =      cookie_v4_init_sequence,
1238 #endif
1239         .route_req      =       tcp_v4_route_req,
1240         .init_seq       =       tcp_v4_init_sequence,
1241         .send_synack    =       tcp_v4_send_synack,
1242 };
1243
1244 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1245 {
1246         /* Never answer to SYNs send to broadcast or multicast */
1247         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1248                 goto drop;
1249
1250         return tcp_conn_request(&tcp_request_sock_ops,
1251                                 &tcp_request_sock_ipv4_ops, sk, skb);
1252
1253 drop:
1254         tcp_listendrop(sk);
1255         return 0;
1256 }
1257 EXPORT_SYMBOL(tcp_v4_conn_request);
1258
1259
1260 /*
1261  * The three way handshake has completed - we got a valid synack -
1262  * now create the new socket.
1263  */
1264 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1265                                   struct request_sock *req,
1266                                   struct dst_entry *dst,
1267                                   struct request_sock *req_unhash,
1268                                   bool *own_req)
1269 {
1270         struct inet_request_sock *ireq;
1271         struct inet_sock *newinet;
1272         struct tcp_sock *newtp;
1273         struct sock *newsk;
1274 #ifdef CONFIG_TCP_MD5SIG
1275         struct tcp_md5sig_key *key;
1276 #endif
1277         struct ip_options_rcu *inet_opt;
1278
1279         if (sk_acceptq_is_full(sk))
1280                 goto exit_overflow;
1281
1282         newsk = tcp_create_openreq_child(sk, req, skb);
1283         if (!newsk)
1284                 goto exit_nonewsk;
1285
1286         newsk->sk_gso_type = SKB_GSO_TCPV4;
1287         inet_sk_rx_dst_set(newsk, skb);
1288
1289         newtp                 = tcp_sk(newsk);
1290         newinet               = inet_sk(newsk);
1291         ireq                  = inet_rsk(req);
1292         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1293         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1294         newsk->sk_bound_dev_if = ireq->ir_iif;
1295         newinet->inet_saddr           = ireq->ir_loc_addr;
1296         inet_opt              = ireq->opt;
1297         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1298         ireq->opt             = NULL;
1299         newinet->mc_index     = inet_iif(skb);
1300         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1301         newinet->rcv_tos      = ip_hdr(skb)->tos;
1302         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1303         if (inet_opt)
1304                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1305         newinet->inet_id = newtp->write_seq ^ jiffies;
1306
1307         if (!dst) {
1308                 dst = inet_csk_route_child_sock(sk, newsk, req);
1309                 if (!dst)
1310                         goto put_and_exit;
1311         } else {
1312                 /* syncookie case : see end of cookie_v4_check() */
1313         }
1314         sk_setup_caps(newsk, dst);
1315
1316         tcp_ca_openreq_child(newsk, dst);
1317
1318         tcp_sync_mss(newsk, dst_mtu(dst));
1319         newtp->advmss = dst_metric_advmss(dst);
1320         if (tcp_sk(sk)->rx_opt.user_mss &&
1321             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1322                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1323
1324         tcp_initialize_rcv_mss(newsk);
1325
1326 #ifdef CONFIG_TCP_MD5SIG
1327         /* Copy over the MD5 key from the original socket */
1328         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1329                                 AF_INET);
1330         if (key) {
1331                 /*
1332                  * We're using one, so create a matching key
1333                  * on the newsk structure. If we fail to get
1334                  * memory, then we end up not copying the key
1335                  * across. Shucks.
1336                  */
1337                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1338                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1339                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1340         }
1341 #endif
1342
1343         if (__inet_inherit_port(sk, newsk) < 0)
1344                 goto put_and_exit;
1345         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1346         if (*own_req)
1347                 tcp_move_syn(newtp, req);
1348
1349         return newsk;
1350
1351 exit_overflow:
1352         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1353 exit_nonewsk:
1354         dst_release(dst);
1355 exit:
1356         tcp_listendrop(sk);
1357         return NULL;
1358 put_and_exit:
1359         inet_csk_prepare_forced_close(newsk);
1360         tcp_done(newsk);
1361         goto exit;
1362 }
1363 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1364
/* With CONFIG_SYN_COOKIES, run every non-SYN segment arriving on the
 * listener through cookie_v4_check() and return its result.  SYN segments,
 * and all segments when syncookies are compiled out, return @sk unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
1375
1376 /* The socket must have it's spinlock held when we get
1377  * here, unless it is a TCP_LISTEN socket.
1378  *
1379  * We have a potential double-lock case here, so even when
1380  * doing backlog processing we use the BH locking scheme.
1381  * This is because we cannot sleep with the original spinlock
1382  * held.
1383  */
1384 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1385 {
1386         struct sock *rsk;
1387
1388         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1389                 struct dst_entry *dst = sk->sk_rx_dst;
1390
1391                 sock_rps_save_rxhash(sk, skb);
1392                 sk_mark_napi_id(sk, skb);
1393                 if (dst) {
1394                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1395                             !dst->ops->check(dst, 0)) {
1396                                 dst_release(dst);
1397                                 sk->sk_rx_dst = NULL;
1398                         }
1399                 }
1400                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1401                 return 0;
1402         }
1403
1404         if (tcp_checksum_complete(skb))
1405                 goto csum_err;
1406
1407         if (sk->sk_state == TCP_LISTEN) {
1408                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1409
1410                 if (!nsk)
1411                         goto discard;
1412                 if (nsk != sk) {
1413                         sock_rps_save_rxhash(nsk, skb);
1414                         sk_mark_napi_id(nsk, skb);
1415                         if (tcp_child_process(sk, nsk, skb)) {
1416                                 rsk = nsk;
1417                                 goto reset;
1418                         }
1419                         return 0;
1420                 }
1421         } else
1422                 sock_rps_save_rxhash(sk, skb);
1423
1424         if (tcp_rcv_state_process(sk, skb)) {
1425                 rsk = sk;
1426                 goto reset;
1427         }
1428         return 0;
1429
1430 reset:
1431         tcp_v4_send_reset(rsk, skb);
1432 discard:
1433         kfree_skb(skb);
1434         /* Be careful here. If this function gets more complicated and
1435          * gcc suffers from register pressure on the x86, sk (in %ebx)
1436          * might be destroyed here. This current version compiles correctly,
1437          * but you have been warned.
1438          */
1439         return 0;
1440
1441 csum_err:
1442         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1443         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1444         goto discard;
1445 }
1446 EXPORT_SYMBOL(tcp_v4_do_rcv);
1447
1448 void tcp_v4_early_demux(struct sk_buff *skb)
1449 {
1450         const struct iphdr *iph;
1451         const struct tcphdr *th;
1452         struct sock *sk;
1453
1454         if (skb->pkt_type != PACKET_HOST)
1455                 return;
1456
1457         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1458                 return;
1459
1460         iph = ip_hdr(skb);
1461         th = tcp_hdr(skb);
1462
1463         if (th->doff < sizeof(struct tcphdr) / 4)
1464                 return;
1465
1466         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1467                                        iph->saddr, th->source,
1468                                        iph->daddr, ntohs(th->dest),
1469                                        skb->skb_iif);
1470         if (sk) {
1471                 skb->sk = sk;
1472                 skb->destructor = sock_edemux;
1473                 if (sk_fullsock(sk)) {
1474                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1475
1476                         if (dst)
1477                                 dst = dst_check(dst, 0);
1478                         if (dst &&
1479                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1480                                 skb_dst_set_noref(skb, dst);
1481                 }
1482         }
1483 }
1484
1485 /* Packet is added to VJ-style prequeue for processing in process
1486  * context, if a reader task is waiting. Apparently, this exciting
1487  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1488  * failed somewhere. Latency? Burstiness? Well, at least now we will
1489  * see, why it failed. 8)8)                               --ANK
1490  *
1491  */
1492 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1493 {
1494         struct tcp_sock *tp = tcp_sk(sk);
1495
1496         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1497                 return false;
1498
1499         if (skb->len <= tcp_hdrlen(skb) &&
1500             skb_queue_len(&tp->ucopy.prequeue) == 0)
1501                 return false;
1502
1503         /* Before escaping RCU protected region, we need to take care of skb
1504          * dst. Prequeue is only enabled for established sockets.
1505          * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
1506          * Instead of doing full sk_rx_dst validity here, let's perform
1507          * an optimistic check.
1508          */
1509         if (likely(sk->sk_rx_dst))
1510                 skb_dst_drop(skb);
1511         else
1512                 skb_dst_force_safe(skb);
1513
1514         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1515         tp->ucopy.memory += skb->truesize;
1516         if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
1517             tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
1518                 struct sk_buff *skb1;
1519
1520                 BUG_ON(sock_owned_by_user(sk));
1521                 __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
1522                                 skb_queue_len(&tp->ucopy.prequeue));
1523
1524                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1525                         sk_backlog_rcv(sk, skb1);
1526
1527                 tp->ucopy.memory = 0;
1528         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1529                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1530                                            POLLIN | POLLRDNORM | POLLRDBAND);
1531                 if (!inet_csk_ack_scheduled(sk))
1532                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1533                                                   (3 * tcp_rto_min(sk)) / 4,
1534                                                   TCP_RTO_MAX);
1535         }
1536         return true;
1537 }
1538 EXPORT_SYMBOL(tcp_prequeue);
1539
1540 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1541 {
1542         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1543
1544         /* Only socket owner can try to collapse/prune rx queues
1545          * to reduce memory overhead, so add a little headroom here.
1546          * Few sockets backlog are possibly concurrently non empty.
1547          */
1548         limit += 64*1024;
1549
1550         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1551          * we can fix skb->truesize to its real value to avoid future drops.
1552          * This is valid because skb is not yet charged to the socket.
1553          * It has been noticed pure SACK packets were sometimes dropped
1554          * (if cooked by drivers without copybreak feature).
1555          */
1556         if (!skb->data_len)
1557                 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
1558
1559         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1560                 bh_unlock_sock(sk);
1561                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1562                 return true;
1563         }
1564         return false;
1565 }
1566 EXPORT_SYMBOL(tcp_add_backlog);
1567
1568 /*
1569  *      From tcp_input.c
1570  */
1571
1572 int tcp_v4_rcv(struct sk_buff *skb)
1573 {
1574         struct net *net = dev_net(skb->dev);
1575         const struct iphdr *iph;
1576         const struct tcphdr *th;
1577         bool refcounted;
1578         struct sock *sk;
1579         int ret;
1580
1581         if (skb->pkt_type != PACKET_HOST)
1582                 goto discard_it;
1583
1584         /* Count it even if it's bad */
1585         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1586
1587         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1588                 goto discard_it;
1589
1590         th = (const struct tcphdr *)skb->data;
1591
1592         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1593                 goto bad_packet;
1594         if (!pskb_may_pull(skb, th->doff * 4))
1595                 goto discard_it;
1596
1597         /* An explanation is required here, I think.
1598          * Packet length and doff are validated by header prediction,
1599          * provided case of th->doff==0 is eliminated.
1600          * So, we defer the checks. */
1601
1602         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1603                 goto csum_error;
1604
1605         th = (const struct tcphdr *)skb->data;
1606         iph = ip_hdr(skb);
1607         /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1608          * barrier() makes sure compiler wont play fool^Waliasing games.
1609          */
1610         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1611                 sizeof(struct inet_skb_parm));
1612         barrier();
1613
1614         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1615         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1616                                     skb->len - th->doff * 4);
1617         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1618         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1619         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1620         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1621         TCP_SKB_CB(skb)->sacked  = 0;
1622
1623 lookup:
1624         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1625                                th->dest, &refcounted);
1626         if (!sk)
1627                 goto no_tcp_socket;
1628
1629 process:
1630         if (sk->sk_state == TCP_TIME_WAIT)
1631                 goto do_time_wait;
1632
1633         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1634                 struct request_sock *req = inet_reqsk(sk);
1635                 struct sock *nsk;
1636
1637                 sk = req->rsk_listener;
1638                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1639                         sk_drops_add(sk, skb);
1640                         reqsk_put(req);
1641                         goto discard_it;
1642                 }
1643                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1644                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1645                         goto lookup;
1646                 }
1647                 /* We own a reference on the listener, increase it again
1648                  * as we might lose it too soon.
1649                  */
1650                 sock_hold(sk);
1651                 refcounted = true;
1652                 nsk = tcp_check_req(sk, skb, req, false);
1653                 if (!nsk) {
1654                         reqsk_put(req);
1655                         goto discard_and_relse;
1656                 }
1657                 if (nsk == sk) {
1658                         reqsk_put(req);
1659                 } else if (tcp_child_process(sk, nsk, skb)) {
1660                         tcp_v4_send_reset(nsk, skb);
1661                         goto discard_and_relse;
1662                 } else {
1663                         sock_put(sk);
1664                         return 0;
1665                 }
1666         }
1667         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1668                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1669                 goto discard_and_relse;
1670         }
1671
1672         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1673                 goto discard_and_relse;
1674
1675         if (tcp_v4_inbound_md5_hash(sk, skb))
1676                 goto discard_and_relse;
1677
1678         nf_reset(skb);
1679
1680         if (sk_filter(sk, skb))
1681                 goto discard_and_relse;
1682
1683         skb->dev = NULL;
1684
1685         if (sk->sk_state == TCP_LISTEN) {
1686                 ret = tcp_v4_do_rcv(sk, skb);
1687                 goto put_and_return;
1688         }
1689
1690         sk_incoming_cpu_update(sk);
1691
1692         bh_lock_sock_nested(sk);
1693         tcp_segs_in(tcp_sk(sk), skb);
1694         ret = 0;
1695         if (!sock_owned_by_user(sk)) {
1696                 if (!tcp_prequeue(sk, skb))
1697                         ret = tcp_v4_do_rcv(sk, skb);
1698         } else if (tcp_add_backlog(sk, skb)) {
1699                 goto discard_and_relse;
1700         }
1701         bh_unlock_sock(sk);
1702
1703 put_and_return:
1704         if (refcounted)
1705                 sock_put(sk);
1706
1707         return ret;
1708
1709 no_tcp_socket:
1710         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1711                 goto discard_it;
1712
1713         if (tcp_checksum_complete(skb)) {
1714 csum_error:
1715                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1716 bad_packet:
1717                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1718         } else {
1719                 tcp_v4_send_reset(NULL, skb);
1720         }
1721
1722 discard_it:
1723         /* Discard frame. */
1724         kfree_skb(skb);
1725         return 0;
1726
1727 discard_and_relse:
1728         sk_drops_add(sk, skb);
1729         if (refcounted)
1730                 sock_put(sk);
1731         goto discard_it;
1732
1733 do_time_wait:
1734         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1735                 inet_twsk_put(inet_twsk(sk));
1736                 goto discard_it;
1737         }
1738
1739         if (tcp_checksum_complete(skb)) {
1740                 inet_twsk_put(inet_twsk(sk));
1741                 goto csum_error;
1742         }
1743         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1744         case TCP_TW_SYN: {
1745                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1746                                                         &tcp_hashinfo, skb,
1747                                                         __tcp_hdrlen(th),
1748                                                         iph->saddr, th->source,
1749                                                         iph->daddr, th->dest,
1750                                                         inet_iif(skb));
1751                 if (sk2) {
1752                         inet_twsk_deschedule_put(inet_twsk(sk));
1753                         sk = sk2;
1754                         refcounted = false;
1755                         goto process;
1756                 }
1757                 /* Fall through to ACK */
1758         }
1759         case TCP_TW_ACK:
1760                 tcp_v4_timewait_ack(sk, skb);
1761                 break;
1762         case TCP_TW_RST:
1763                 tcp_v4_send_reset(sk, skb);
1764                 inet_twsk_deschedule_put(inet_twsk(sk));
1765                 goto discard_it;
1766         case TCP_TW_SUCCESS:;
1767         }
1768         goto discard_it;
1769 }
1770
1771 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1772         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1773         .twsk_unique    = tcp_twsk_unique,
1774         .twsk_destructor= tcp_twsk_destructor,
1775 };
1776
1777 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1778 {
1779         struct dst_entry *dst = skb_dst(skb);
1780
1781         if (dst && dst_hold_safe(dst)) {
1782                 sk->sk_rx_dst = dst;
1783                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1784         }
1785 }
1786 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1787
1788 const struct inet_connection_sock_af_ops ipv4_specific = {
1789         .queue_xmit        = ip_queue_xmit,
1790         .send_check        = tcp_v4_send_check,
1791         .rebuild_header    = inet_sk_rebuild_header,
1792         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1793         .conn_request      = tcp_v4_conn_request,
1794         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1795         .net_header_len    = sizeof(struct iphdr),
1796         .setsockopt        = ip_setsockopt,
1797         .getsockopt        = ip_getsockopt,
1798         .addr2sockaddr     = inet_csk_addr2sockaddr,
1799         .sockaddr_len      = sizeof(struct sockaddr_in),
1800         .bind_conflict     = inet_csk_bind_conflict,
1801 #ifdef CONFIG_COMPAT
1802         .compat_setsockopt = compat_ip_setsockopt,
1803         .compat_getsockopt = compat_ip_getsockopt,
1804 #endif
1805         .mtu_reduced       = tcp_v4_mtu_reduced,
1806 };
1807 EXPORT_SYMBOL(ipv4_specific);
1808
#ifdef CONFIG_TCP_MD5SIG
/* IPv4 flavour of the TCP MD5 signature helpers; tcp_v4_init_sock() installs
 * this table as tcp_sock->af_specific.
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_parse	= tcp_v4_parse_md5_keys,
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
};
#endif
1816
1817 /* NOTE: A lot of things set to zero explicitly by call to
1818  *       sk_alloc() so need not be done here.
1819  */
1820 static int tcp_v4_init_sock(struct sock *sk)
1821 {
1822         struct inet_connection_sock *icsk = inet_csk(sk);
1823
1824         tcp_init_sock(sk);
1825
1826         icsk->icsk_af_ops = &ipv4_specific;
1827
1828 #ifdef CONFIG_TCP_MD5SIG
1829         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1830 #endif
1831
1832         return 0;
1833 }
1834
1835 void tcp_v4_destroy_sock(struct sock *sk)
1836 {
1837         struct tcp_sock *tp = tcp_sk(sk);
1838
1839         tcp_clear_xmit_timers(sk);
1840
1841         tcp_cleanup_congestion_control(sk);
1842
1843         /* Cleanup up the write buffer. */
1844         tcp_write_queue_purge(sk);
1845
1846         /* Cleans up our, hopefully empty, out_of_order_queue. */
1847         skb_rbtree_purge(&tp->out_of_order_queue);
1848
1849 #ifdef CONFIG_TCP_MD5SIG
1850         /* Clean up the MD5 key list, if any */
1851         if (tp->md5sig_info) {
1852                 tcp_clear_md5_list(sk);
1853                 kfree_rcu(tp->md5sig_info, rcu);
1854                 tp->md5sig_info = NULL;
1855         }
1856 #endif
1857
1858         /* Clean prequeue, it must be empty really */
1859         __skb_queue_purge(&tp->ucopy.prequeue);
1860
1861         /* Clean up a referenced TCP bind bucket. */
1862         if (inet_csk(sk)->icsk_bind_hash)
1863                 inet_put_port(sk);
1864
1865         BUG_ON(tp->fastopen_rsk);
1866
1867         /* If socket is aborted during connect operation */
1868         tcp_free_fastopen_req(tp);
1869         tcp_saved_syn_free(tp);
1870
1871         local_bh_disable();
1872         sk_sockets_allocated_dec(sk);
1873         local_bh_enable();
1874
1875         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1876                 sock_release_memcg(sk);
1877 }
1878 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1879
1880 #ifdef CONFIG_PROC_FS
1881 /* Proc filesystem TCP sock list dumping. */
1882
1883 /*
1884  * Get next listener socket follow cur.  If cur is NULL, get first socket
1885  * starting from bucket given in st->bucket; when st->bucket is zero the
1886  * very first socket in the hash table is returned.
1887  */
1888 static void *listening_get_next(struct seq_file *seq, void *cur)
1889 {
1890         struct tcp_iter_state *st = seq->private;
1891         struct net *net = seq_file_net(seq);
1892         struct inet_listen_hashbucket *ilb;
1893         struct inet_connection_sock *icsk;
1894         struct sock *sk = cur;
1895
1896         if (!sk) {
1897 get_head:
1898                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1899                 spin_lock_bh(&ilb->lock);
1900                 sk = sk_head(&ilb->head);
1901                 st->offset = 0;
1902                 goto get_sk;
1903         }
1904         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1905         ++st->num;
1906         ++st->offset;
1907
1908         sk = sk_next(sk);
1909 get_sk:
1910         sk_for_each_from(sk) {
1911                 if (!net_eq(sock_net(sk), net))
1912                         continue;
1913                 if (sk->sk_family == st->family)
1914                         return sk;
1915                 icsk = inet_csk(sk);
1916         }
1917         spin_unlock_bh(&ilb->lock);
1918         st->offset = 0;
1919         if (++st->bucket < INET_LHTABLE_SIZE)
1920                 goto get_head;
1921         return NULL;
1922 }
1923
1924 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1925 {
1926         struct tcp_iter_state *st = seq->private;
1927         void *rc;
1928
1929         st->bucket = 0;
1930         st->offset = 0;
1931         rc = listening_get_next(seq, NULL);
1932
1933         while (rc && *pos) {
1934                 rc = listening_get_next(seq, rc);
1935                 --*pos;
1936         }
1937         return rc;
1938 }
1939
1940 static inline bool empty_bucket(const struct tcp_iter_state *st)
1941 {
1942         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1943 }
1944
1945 /*
1946  * Get first established socket starting from bucket given in st->bucket.
1947  * If st->bucket is zero, the very first socket in the hash is returned.
1948  */
1949 static void *established_get_first(struct seq_file *seq)
1950 {
1951         struct tcp_iter_state *st = seq->private;
1952         struct net *net = seq_file_net(seq);
1953         void *rc = NULL;
1954
1955         st->offset = 0;
1956         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1957                 struct sock *sk;
1958                 struct hlist_nulls_node *node;
1959                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1960
1961                 /* Lockless fast path for the common case of empty buckets */
1962                 if (empty_bucket(st))
1963                         continue;
1964
1965                 spin_lock_bh(lock);
1966                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1967                         if (sk->sk_family != st->family ||
1968                             !net_eq(sock_net(sk), net)) {
1969                                 continue;
1970                         }
1971                         rc = sk;
1972                         goto out;
1973                 }
1974                 spin_unlock_bh(lock);
1975         }
1976 out:
1977         return rc;
1978 }
1979
1980 static void *established_get_next(struct seq_file *seq, void *cur)
1981 {
1982         struct sock *sk = cur;
1983         struct hlist_nulls_node *node;
1984         struct tcp_iter_state *st = seq->private;
1985         struct net *net = seq_file_net(seq);
1986
1987         ++st->num;
1988         ++st->offset;
1989
1990         sk = sk_nulls_next(sk);
1991
1992         sk_nulls_for_each_from(sk, node) {
1993                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1994                         return sk;
1995         }
1996
1997         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1998         ++st->bucket;
1999         return established_get_first(seq);
2000 }
2001
2002 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2003 {
2004         struct tcp_iter_state *st = seq->private;
2005         void *rc;
2006
2007         st->bucket = 0;
2008         rc = established_get_first(seq);
2009
2010         while (rc && pos) {
2011                 rc = established_get_next(seq, rc);
2012                 --pos;
2013         }
2014         return rc;
2015 }
2016
2017 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2018 {
2019         void *rc;
2020         struct tcp_iter_state *st = seq->private;
2021
2022         st->state = TCP_SEQ_STATE_LISTENING;
2023         rc        = listening_get_idx(seq, &pos);
2024
2025         if (!rc) {
2026                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2027                 rc        = established_get_idx(seq, pos);
2028         }
2029
2030         return rc;
2031 }
2032
2033 static void *tcp_seek_last_pos(struct seq_file *seq)
2034 {
2035         struct tcp_iter_state *st = seq->private;
2036         int offset = st->offset;
2037         int orig_num = st->num;
2038         void *rc = NULL;
2039
2040         switch (st->state) {
2041         case TCP_SEQ_STATE_LISTENING:
2042                 if (st->bucket >= INET_LHTABLE_SIZE)
2043                         break;
2044                 st->state = TCP_SEQ_STATE_LISTENING;
2045                 rc = listening_get_next(seq, NULL);
2046                 while (offset-- && rc)
2047                         rc = listening_get_next(seq, rc);
2048                 if (rc)
2049                         break;
2050                 st->bucket = 0;
2051                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2052                 /* Fallthrough */
2053         case TCP_SEQ_STATE_ESTABLISHED:
2054                 if (st->bucket > tcp_hashinfo.ehash_mask)
2055                         break;
2056                 rc = established_get_first(seq);
2057                 while (offset-- && rc)
2058                         rc = established_get_next(seq, rc);
2059         }
2060
2061         st->num = orig_num;
2062
2063         return rc;
2064 }
2065
2066 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2067 {
2068         struct tcp_iter_state *st = seq->private;
2069         void *rc;
2070
2071         if (*pos && *pos == st->last_pos) {
2072                 rc = tcp_seek_last_pos(seq);
2073                 if (rc)
2074                         goto out;
2075         }
2076
2077         st->state = TCP_SEQ_STATE_LISTENING;
2078         st->num = 0;
2079         st->bucket = 0;
2080         st->offset = 0;
2081         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2082
2083 out:
2084         st->last_pos = *pos;
2085         return rc;
2086 }
2087
2088 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2089 {
2090         struct tcp_iter_state *st = seq->private;
2091         void *rc = NULL;
2092
2093         if (v == SEQ_START_TOKEN) {
2094                 rc = tcp_get_idx(seq, 0);
2095                 goto out;
2096         }
2097
2098         switch (st->state) {
2099         case TCP_SEQ_STATE_LISTENING:
2100                 rc = listening_get_next(seq, v);
2101                 if (!rc) {
2102                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2103                         st->bucket = 0;
2104                         st->offset = 0;
2105                         rc        = established_get_first(seq);
2106                 }
2107                 break;
2108         case TCP_SEQ_STATE_ESTABLISHED:
2109                 rc = established_get_next(seq, v);
2110                 break;
2111         }
2112 out:
2113         ++*pos;
2114         st->last_pos = *pos;
2115         return rc;
2116 }
2117
2118 static void tcp_seq_stop(struct seq_file *seq, void *v)
2119 {
2120         struct tcp_iter_state *st = seq->private;
2121
2122         switch (st->state) {
2123         case TCP_SEQ_STATE_LISTENING:
2124                 if (v != SEQ_START_TOKEN)
2125                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2126                 break;
2127         case TCP_SEQ_STATE_ESTABLISHED:
2128                 if (v)
2129                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2130                 break;
2131         }
2132 }
2133
2134 int tcp_seq_open(struct inode *inode, struct file *file)
2135 {
2136         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2137         struct tcp_iter_state *s;
2138         int err;
2139
2140         err = seq_open_net(inode, file, &afinfo->seq_ops,
2141                           sizeof(struct tcp_iter_state));
2142         if (err < 0)
2143                 return err;
2144
2145         s = ((struct seq_file *)file->private_data)->private;
2146         s->family               = afinfo->family;
2147         s->last_pos             = 0;
2148         return 0;
2149 }
2150 EXPORT_SYMBOL(tcp_seq_open);
2151
2152 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2153 {
2154         int rc = 0;
2155         struct proc_dir_entry *p;
2156
2157         afinfo->seq_ops.start           = tcp_seq_start;
2158         afinfo->seq_ops.next            = tcp_seq_next;
2159         afinfo->seq_ops.stop            = tcp_seq_stop;
2160
2161         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2162                              afinfo->seq_fops, afinfo);
2163         if (!p)
2164                 rc = -ENOMEM;
2165         return rc;
2166 }
2167 EXPORT_SYMBOL(tcp_proc_register);
2168
2169 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2170 {
2171         remove_proc_entry(afinfo->name, net->proc_net);
2172 }
2173 EXPORT_SYMBOL(tcp_proc_unregister);
2174
2175 static void get_openreq4(const struct request_sock *req,
2176                          struct seq_file *f, int i)
2177 {
2178         const struct inet_request_sock *ireq = inet_rsk(req);
2179         long delta = req->rsk_timer.expires - jiffies;
2180
2181         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2182                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2183                 i,
2184                 ireq->ir_loc_addr,
2185                 ireq->ir_num,
2186                 ireq->ir_rmt_addr,
2187                 ntohs(ireq->ir_rmt_port),
2188                 TCP_SYN_RECV,
2189                 0, 0, /* could print option size, but that is af dependent. */
2190                 1,    /* timers active (only the expire timer) */
2191                 jiffies_delta_to_clock_t(delta),
2192                 req->num_timeout,
2193                 from_kuid_munged(seq_user_ns(f),
2194                                  sock_i_uid(req->rsk_listener)),
2195                 0,  /* non standard timer */
2196                 0, /* open_requests have no inode */
2197                 0,
2198                 req);
2199 }
2200
2201 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2202 {
2203         int timer_active;
2204         unsigned long timer_expires;
2205         const struct tcp_sock *tp = tcp_sk(sk);
2206         const struct inet_connection_sock *icsk = inet_csk(sk);
2207         const struct inet_sock *inet = inet_sk(sk);
2208         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2209         __be32 dest = inet->inet_daddr;
2210         __be32 src = inet->inet_rcv_saddr;
2211         __u16 destp = ntohs(inet->inet_dport);
2212         __u16 srcp = ntohs(inet->inet_sport);
2213         int rx_queue;
2214         int state;
2215
2216         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2217             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2218             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2219                 timer_active    = 1;
2220                 timer_expires   = icsk->icsk_timeout;
2221         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2222                 timer_active    = 4;
2223                 timer_expires   = icsk->icsk_timeout;
2224         } else if (timer_pending(&sk->sk_timer)) {
2225                 timer_active    = 2;
2226                 timer_expires   = sk->sk_timer.expires;
2227         } else {
2228                 timer_active    = 0;
2229                 timer_expires = jiffies;
2230         }
2231
2232         state = sk_state_load(sk);
2233         if (state == TCP_LISTEN)
2234                 rx_queue = sk->sk_ack_backlog;
2235         else
2236                 /* Because we don't lock the socket,
2237                  * we might find a transient negative value.
2238                  */
2239                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2240
2241         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2242                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2243                 i, src, srcp, dest, destp, state,
2244                 tp->write_seq - tp->snd_una,
2245                 rx_queue,
2246                 timer_active,
2247                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2248                 icsk->icsk_retransmits,
2249                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2250                 icsk->icsk_probes_out,
2251                 sock_i_ino(sk),
2252                 atomic_read(&sk->sk_refcnt), sk,
2253                 jiffies_to_clock_t(icsk->icsk_rto),
2254                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2255                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2256                 tp->snd_cwnd,
2257                 state == TCP_LISTEN ?
2258                     fastopenq->max_qlen :
2259                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2260 }
2261
2262 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2263                                struct seq_file *f, int i)
2264 {
2265         long delta = tw->tw_timer.expires - jiffies;
2266         __be32 dest, src;
2267         __u16 destp, srcp;
2268
2269         dest  = tw->tw_daddr;
2270         src   = tw->tw_rcv_saddr;
2271         destp = ntohs(tw->tw_dport);
2272         srcp  = ntohs(tw->tw_sport);
2273
2274         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2275                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2276                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2277                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2278                 atomic_read(&tw->tw_refcnt), tw);
2279 }
2280
2281 #define TMPSZ 150
2282
2283 static int tcp4_seq_show(struct seq_file *seq, void *v)
2284 {
2285         struct tcp_iter_state *st;
2286         struct sock *sk = v;
2287
2288         seq_setwidth(seq, TMPSZ - 1);
2289         if (v == SEQ_START_TOKEN) {
2290                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2291                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2292                            "inode");
2293                 goto out;
2294         }
2295         st = seq->private;
2296
2297         if (sk->sk_state == TCP_TIME_WAIT)
2298                 get_timewait4_sock(v, seq, st->num);
2299         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2300                 get_openreq4(v, seq, st->num);
2301         else
2302                 get_tcp4_sock(v, seq, st->num);
2303 out:
2304         seq_pad(seq, '\n');
2305         return 0;
2306 }
2307
2308 static const struct file_operations tcp_afinfo_seq_fops = {
2309         .owner   = THIS_MODULE,
2310         .open    = tcp_seq_open,
2311         .read    = seq_read,
2312         .llseek  = seq_lseek,
2313         .release = seq_release_net
2314 };
2315
2316 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2317         .name           = "tcp",
2318         .family         = AF_INET,
2319         .seq_fops       = &tcp_afinfo_seq_fops,
2320         .seq_ops        = {
2321                 .show           = tcp4_seq_show,
2322         },
2323 };
2324
2325 static int __net_init tcp4_proc_init_net(struct net *net)
2326 {
2327         return tcp_proc_register(net, &tcp4_seq_afinfo);
2328 }
2329
2330 static void __net_exit tcp4_proc_exit_net(struct net *net)
2331 {
2332         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2333 }
2334
2335 static struct pernet_operations tcp4_net_ops = {
2336         .init = tcp4_proc_init_net,
2337         .exit = tcp4_proc_exit_net,
2338 };
2339
2340 int __init tcp4_proc_init(void)
2341 {
2342         return register_pernet_subsys(&tcp4_net_ops);
2343 }
2344
2345 void tcp4_proc_exit(void)
2346 {
2347         unregister_pernet_subsys(&tcp4_net_ops);
2348 }
2349 #endif /* CONFIG_PROC_FS */
2350
2351 struct proto tcp_prot = {
2352         .name                   = "TCP",
2353         .owner                  = THIS_MODULE,
2354         .close                  = tcp_close,
2355         .connect                = tcp_v4_connect,
2356         .disconnect             = tcp_disconnect,
2357         .accept                 = inet_csk_accept,
2358         .ioctl                  = tcp_ioctl,
2359         .init                   = tcp_v4_init_sock,
2360         .destroy                = tcp_v4_destroy_sock,
2361         .shutdown               = tcp_shutdown,
2362         .setsockopt             = tcp_setsockopt,
2363         .getsockopt             = tcp_getsockopt,
2364         .recvmsg                = tcp_recvmsg,
2365         .sendmsg                = tcp_sendmsg,
2366         .sendpage               = tcp_sendpage,
2367         .backlog_rcv            = tcp_v4_do_rcv,
2368         .release_cb             = tcp_release_cb,
2369         .hash                   = inet_hash,
2370         .unhash                 = inet_unhash,
2371         .get_port               = inet_csk_get_port,
2372         .enter_memory_pressure  = tcp_enter_memory_pressure,
2373         .stream_memory_free     = tcp_stream_memory_free,
2374         .sockets_allocated      = &tcp_sockets_allocated,
2375         .orphan_count           = &tcp_orphan_count,
2376         .memory_allocated       = &tcp_memory_allocated,
2377         .memory_pressure        = &tcp_memory_pressure,
2378         .sysctl_mem             = sysctl_tcp_mem,
2379         .sysctl_wmem            = sysctl_tcp_wmem,
2380         .sysctl_rmem            = sysctl_tcp_rmem,
2381         .max_header             = MAX_TCP_HEADER,
2382         .obj_size               = sizeof(struct tcp_sock),
2383         .slab_flags             = SLAB_DESTROY_BY_RCU,
2384         .twsk_prot              = &tcp_timewait_sock_ops,
2385         .rsk_prot               = &tcp_request_sock_ops,
2386         .h.hashinfo             = &tcp_hashinfo,
2387         .no_autobind            = true,
2388 #ifdef CONFIG_COMPAT
2389         .compat_setsockopt      = compat_tcp_setsockopt,
2390         .compat_getsockopt      = compat_tcp_getsockopt,
2391 #endif
2392         .diag_destroy           = tcp_abort,
2393 };
2394 EXPORT_SYMBOL(tcp_prot);
2395
2396 static void __net_exit tcp_sk_exit(struct net *net)
2397 {
2398         int cpu;
2399
2400         for_each_possible_cpu(cpu)
2401                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2402         free_percpu(net->ipv4.tcp_sk);
2403 }
2404
2405 static int __net_init tcp_sk_init(struct net *net)
2406 {
2407         int res, cpu;
2408
2409         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2410         if (!net->ipv4.tcp_sk)
2411                 return -ENOMEM;
2412
2413         for_each_possible_cpu(cpu) {
2414                 struct sock *sk;
2415
2416                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2417                                            IPPROTO_TCP, net);
2418                 if (res)
2419                         goto fail;
2420                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2421                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2422         }
2423
2424         net->ipv4.sysctl_tcp_ecn = 2;
2425         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2426
2427         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2428         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2429         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2430
2431         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2432         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2433         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2434
2435         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2436         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2437         net->ipv4.sysctl_tcp_syncookies = 1;
2438         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2439         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2440         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2441         net->ipv4.sysctl_tcp_orphan_retries = 0;
2442         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2443         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2444
2445         return 0;
2446 fail:
2447         tcp_sk_exit(net);
2448
2449         return res;
2450 }
2451
2452 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2453 {
2454         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2455 }
2456
2457 static struct pernet_operations __net_initdata tcp_sk_ops = {
2458        .init       = tcp_sk_init,
2459        .exit       = tcp_sk_exit,
2460        .exit_batch = tcp_sk_exit_batch,
2461 };
2462
2463 void __init tcp_v4_init(void)
2464 {
2465         inet_hashinfo_init(&tcp_hashinfo);
2466         if (register_pernet_subsys(&tcp_sk_ops))
2467                 panic("Failed to create the TCP control socket.\n");
2468 }