/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>
#include <net/secure_seq.h>
#include <net/tcp_memcontrol.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>
int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);
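
/* Both knobs are runtime-tunable; e.g. (illustrative only):
 *
 *	sysctl -w net.ipv4.tcp_tw_reuse=1
 *	sysctl -w net.ipv4.tcp_low_latency=1
 *
 * tcp_tw_reuse feeds the tcp_twsk_unique() check below; tcp_low_latency
 * steers receive processing away from the prequeue (see tcp_prequeue()).
 */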
#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only the timestamp cache is
	   held not per host, but per port pair, and the TW bucket is used as
	   state holder.

	   If the TW bucket has already been destroyed we fall back to VJ's
	   scheme and use the initial timestamp retrieved from the peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
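
/* Illustrative only: with tcp_tw_reuse=1, a TIME-WAIT port pair whose last
 * timestamp is more than one second old may be reused. The new write_seq
 * starts 65535 + 2 beyond tw_snd_nxt, i.e. past the largest window the old
 * incarnation could still have open, so the two sequence spaces cannot
 * overlap.
 */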
static int tcp_repair_connect(struct sock *sk)
{
	tcp_connect_init(sk);
	tcp_finish_connect(sk, NULL);

	return 0;
}
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk, true);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	if (likely(!tp->repair))
		err = tcp_connect(sk);
	else
		err = tcp_repair_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
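
/* Userspace reaches tcp_v4_connect() through connect(2); a minimal sketch,
 * illustrative only, error handling omitted:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in sin = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);
 *	connect(fd, (struct sockaddr *)&sin, sizeof(sin));
 */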
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
static void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	u32 mtu = tcp_sk(sk)->mtu_info;

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go through
	 * unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember the soft error
	 * in case this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
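
/* Worked example (illustrative only): if icsk_pmtu_cookie is 1500 and an
 * ICMP_FRAG_NEEDED reports mtu=1400, tcp_sync_mss() shrinks the cached MSS
 * to fit the new path MTU and tcp_simple_retransmit() resends the too-big
 * segments immediately instead of waiting for the RTO.
 */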
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */
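
/* For example (illustrative only): a port unreachable arrives as
 * type ICMP_DEST_UNREACH (3), code ICMP_PORT_UNREACH (3);
 * icmp_err_convert[ICMP_PORT_UNREACH].errno is ECONNREFUSED, which is what
 * a connecting socket eventually reports to userspace.
 */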
void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *req;
	__u32 seq;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	if (icmp_skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk) &&
	    type != ICMP_DEST_UNREACH &&
	    code != ICMP_FRAG_NEEDED)
		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	req = tp->fastopen_rsk;
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt) &&
	    (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
		/* For a Fast Open socket, allow seq to be snt_isn. */
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff)
			break;

		/* XXX (TFO) - revisit the following logic for TFO */

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
		tcp_bound_rto(sk);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
				tcp_time_stamp - TCP_SKB_CB(skb)->when);

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	/* XXX (TFO) - if it's a TFO socket and has been accepted, rather
	 * than following the TCP_SYN_RECV case and closing the socket,
	 * we ignore the ICMP error and keep trying like a fully established
	 * socket. Is this the right thing to do?
	 */
	if (req && req->sk == NULL)
		goto out;

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen normally.
			       It can, e.g., if SYNs crossed,
			       or for Fast Open.
			     */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors ordered by their masters, even these two messages have
	 * finally lost their original sense (even Linux sends invalid
	 * PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
static void __tcp_v4_send_check(struct sk_buff *skb,
				__be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
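
/* Checksum offload note (illustrative only): with CHECKSUM_PARTIAL just the
 * pseudo-header sum is stored in th->check; the NIC (or skb_checksum_help()
 * as a fallback) folds in the rest starting at csum_start and writes the
 * result at csum_start + csum_offset. The software path above computes the
 * full checksum in one go.
 */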
int tcp_v4_gso_send_check(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;

	if (!pskb_may_pull(skb, sizeof(*th)))
		return -EINVAL;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	th->check = 0;
	skb->ip_summed = CHECKSUM_PARTIAL;
	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
	return 0;
}
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for the reset?
 *	Answer: if a packet caused an RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP. So we build the reply based only on the
 *		parameters that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
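
/* RFC 793 reset generation, as implemented below (illustrative only):
 * if the offending segment carried an ACK, the RST is sent with
 * seq = the segment's ack_seq and no ACK bit; otherwise seq stays 0 and
 * ack_seq = seq + syn + fin + payload length, with the ACK bit set.
 */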
static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	hash_location = tcp_parse_md5sig_option(th);
	if (!sk && hash_location) {
		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * the listening socket. We do not loosen security here:
		 * the incoming packet is checked against the md5 hash of the
		 * found key; no RST is generated if the md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
					     &tcp_hashinfo, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			return;
		rcu_read_lock();
		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto release_sk1;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto release_sk1;
	} else {
		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					     &ip_hdr(skb)->saddr,
					     AF_INET) : NULL;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When the socket is gone, all binding information is lost and
	 * routing might fail. No choice here: if we choose to force the
	 * input interface, we will misroute in case of an asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	net = dev_net(skb_dst(skb)->dev);
	arg.tos = ip_hdr(skb)->tos;
	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
	if (sk1) {
		rcu_read_unlock();
		sock_put(sk1);
	}
#endif
}
/* The code following below, sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tcp_time_stamp);
		rep.opt[2] = htonl(ts);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (ts) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos);

	inet_twsk_put(tw);
}
static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
			      struct request_sock *req,
			      struct request_values *rvp,
			      u16 queue_mapping,
			      bool nocache)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, rvp, NULL);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);

		skb_set_queue_mapping(skb, queue_mapping);
		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
		if (!tcp_rsk(req)->snt_synack && !err)
			tcp_rsk(req)->snt_synack = tcp_time_stamp;
	}

	return err;
}
static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
			     struct request_values *rvp)
{
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
	return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
}
/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}
/*
 * Return true if a syncookie should be sent
 */
bool tcp_syn_flood_action(struct sock *sk,
			  const struct sk_buff *skb,
			  const char *proto)
{
	const char *msg = "Dropping request";
	bool want_cookie = false;
	struct listen_sock *lopt;

#ifdef CONFIG_SYN_COOKIES
	if (sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = true;
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
	if (!lopt->synflood_warned) {
		lopt->synflood_warned = 1;
		pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
			proto, ntohs(tcp_hdr(skb)->dest), msg);
	}
	return want_cookie;
}
EXPORT_SYMBOL(tcp_syn_flood_action);
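
/* Illustrative only: syncookies are controlled at runtime with
 *
 *	sysctl -w net.ipv4.tcp_syncookies=1
 *
 * When the SYN queue of a listener fills, the warning above fires once per
 * listen socket and, with syncookies enabled, further SYNs are answered
 * statelessly instead of being dropped.
 */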
/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
{
	const struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options_rcu *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = sizeof(*dopt) + opt->optlen;

		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(&dopt->opt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}
#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */
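
/* Keys are installed from userspace via setsockopt(TCP_MD5SIG); a minimal
 * sketch, illustrative only, error handling omitted:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */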
/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *pos;
	unsigned int size = sizeof(struct in_addr);
	struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       sock_owned_by_user(sk) ||
				       lockdep_is_held(&sk->sk_lock.slock));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);
struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	/* Look up with the caller's address and family; passing &addr here
	 * (the address of the pointer itself) would compare garbage.
	 */
	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk));
	if (hlist_empty(&md5sig->head))
		tcp_free_md5sig_pool();
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);
void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *pos, *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	if (!hlist_empty(&md5sig->head))
		tcp_free_md5sig_pool();
	hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}
static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_key || !cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
			const struct sock *sk, const struct request_sock *req,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->inet_saddr;
		daddr = inet_sk(sk)->inet_daddr;
	} else if (req) {
		saddr = inet_rsk(req)->loc_addr;
		daddr = inet_rsk(req)->rmt_addr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;

	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives,
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
				     : "");
		return true;
	}
	return false;
}

#endif
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
};
#endif
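
/* Server-side Fast Open is gated by the sysctl_tcp_fastopen bitmap
 * (see the TFO_* flags in include/net/tcp.h); e.g. (illustrative only):
 *
 *	sysctl -w net.ipv4.tcp_fastopen=3
 *
 * enables the client (bit 0) and server (bit 1) paths. A listener must
 * additionally set the TCP_FASTOPEN socket option to size its fastopen
 * queue (max_qlen below).
 */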
static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
			       struct request_sock *req,
			       struct tcp_fastopen_cookie *foc,
			       struct tcp_fastopen_cookie *valid_foc)
{
	bool skip_cookie = false;
	struct fastopen_queue *fastopenq;

	if (likely(!fastopen_cookie_present(foc))) {
		/* See include/net/tcp.h for the meaning of these knobs */
		if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
		    ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
		    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
			skip_cookie = true; /* no cookie to validate */
		else
			return false;
	}
	fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
	/* A FO option is present; bump the counter. */
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);

	/* Make sure the listener has enabled fastopen, and we don't
	 * exceed the max # of pending TFO requests allowed before trying
	 * to validate the cookie, in order to avoid burning CPU cycles
	 * unnecessarily.
	 *
	 * XXX (TFO) - The implication of checking the max_qlen before
	 * processing a cookie request is that clients can't differentiate
	 * between qlen overflow causing Fast Open to be disabled
	 * temporarily vs a server not supporting Fast Open at all.
	 */
	if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
	    fastopenq == NULL || fastopenq->max_qlen == 0)
		return false;

	if (fastopenq->qlen >= fastopenq->max_qlen) {
		struct request_sock *req1;
		spin_lock(&fastopenq->lock);
		req1 = fastopenq->rskq_rst_head;
		if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
			spin_unlock(&fastopenq->lock);
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
			/* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
			foc->len = -1;
			return false;
		}
		fastopenq->rskq_rst_head = req1->dl_next;
		fastopenq->qlen--;
		spin_unlock(&fastopenq->lock);
		reqsk_free(req1);
	}
	if (skip_cookie) {
		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		return true;
	}
	if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
		if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
			tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
			if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
			    memcmp(&foc->val[0], &valid_foc->val[0],
				   TCP_FASTOPEN_COOKIE_SIZE) != 0)
				return false;
			valid_foc->len = -1;
		}
		/* Acknowledge the data received from the peer. */
		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		return true;
	} else if (foc->len == 0) { /* Client requesting a cookie */
		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
		NET_INC_STATS_BH(sock_net(sk),
				 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
	} else {
		/* Client sent a cookie with wrong size. Treat it
		 * the same as invalid and return a valid one.
		 */
		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
	}
	return false;
}
static int tcp_v4_conn_req_fastopen(struct sock *sk,
				    struct sk_buff *skb,
				    struct sk_buff *skb_synack,
				    struct request_sock *req,
				    struct request_values *rvp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct sock *child;
	int err;

	req->retrans = 0;
	req->sk = NULL;

	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
	if (child == NULL) {
		NET_INC_STATS_BH(sock_net(sk),
				 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
		kfree_skb(skb_synack);
		return -1;
	}
	err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
				    ireq->rmt_addr, ireq->opt);
	err = net_xmit_eval(err);
	if (!err)
		tcp_rsk(req)->snt_synack = tcp_time_stamp;
	/* XXX (TFO) - is it ok to ignore error and continue? */

	spin_lock(&queue->fastopenq->lock);
	queue->fastopenq->qlen++;
	spin_unlock(&queue->fastopenq->lock);

	/* Initialize the child socket. Have to fix some values to take
	 * into account the child is a Fast Open socket and is created
	 * only out of the bits carried in the SYN packet.
	 */
	tp = tcp_sk(child);

	tp->fastopen_rsk = req;
	/* Do a hold on the listener sk so that if the listener is being
	 * closed, the child that has been accepted can live on and still
	 * access listen_lock.
	 */
	sock_hold(sk);
	tcp_rsk(req)->listener = sk;

	/* RFC1323: The window in SYN & SYN/ACK segments is never
	 * scaled. So correct it appropriately.
	 */
	tp->snd_wnd = ntohs(tcp_hdr(skb)->window);

	/* Activate the retrans timer so that SYNACK can be retransmitted.
	 * The request socket is not added to the SYN table of the parent
	 * because it's been added to the accept queue directly.
	 */
	inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
				  TCP_TIMEOUT_INIT, TCP_RTO_MAX);

	/* Add the child socket directly into the accept queue */
	inet_csk_reqsk_queue_add(sk, req, child);

	/* Now finish processing the fastopen child socket. */
	inet_csk(child)->icsk_af_ops->rebuild_header(child);
	tcp_init_congestion_control(child);
	tcp_mtup_init(child);
	tcp_init_buffer_space(child);
	tcp_init_metrics(child);

	/* Queue the data carried in the SYN packet. We need to first
	 * bump skb's refcnt because the caller will attempt to free it.
	 *
	 * XXX (TFO) - we honor a zero-payload TFO request for now.
	 * (Any reason not to?)
	 */
	if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
		/* Don't queue the skb if there is no payload in SYN.
		 * XXX (TFO) - How about SYN+FIN?
		 */
		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
	} else {
		skb = skb_get(skb);
		skb_dst_drop(skb);
		__skb_pull(skb, tcp_hdr(skb)->doff * 4);
		skb_set_owner_r(skb, child);
		__skb_queue_tail(&child->sk_receive_queue, skb);
		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		tp->syn_data_acked = 1;
	}
	sk->sk_data_ready(sk, 0);
	bh_unlock_sock(child);
	sock_put(child);
	WARN_ON(req->sk == NULL);
	return 0;
}
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_extend_values tmp_ext;
	struct tcp_options_received tmp_opt;
	const u8 *hash_location;
	struct request_sock *req;
	struct inet_request_sock *ireq;
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = NULL;
	__be32 saddr = ip_hdr(skb)->saddr;
	__be32 daddr = ip_hdr(skb)->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	bool want_cookie = false;
	struct flowi4 fl4;
	struct tcp_fastopen_cookie foc = { .len = -1 };
	struct tcp_fastopen_cookie valid_foc = { .len = -1 };
	struct sk_buff *skb_synack;
	int do_fastopen;

	/* Never answer SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
		if (!want_cookie)
			goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
		goto drop;

	req = inet_reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

#ifdef CONFIG_TCP_MD5SIG
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
	tmp_opt.user_mss  = tp->rx_opt.user_mss;
	tcp_parse_options(skb, &tmp_opt, &hash_location, 0,
			  want_cookie ? NULL : &foc);

	if (tmp_opt.cookie_plus > 0 &&
	    tmp_opt.saw_tstamp &&
	    !tp->rx_opt.cookie_out_never &&
	    (sysctl_tcp_cookie_size > 0 ||
	     (tp->cookie_values != NULL &&
	      tp->cookie_values->cookie_desired > 0))) {
		u8 *c;
		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;

		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
			goto drop_and_release;

		/* Secret recipe starts with IP addresses */
		*mess++ ^= (__force u32)daddr;
		*mess++ ^= (__force u32)saddr;

		/* plus variable length Initiator Cookie */
		c = (u8 *)mess;
		while (l-- > 0)
			*c++ ^= *hash_location++;

		want_cookie = false;	/* not our kind of cookie */
		tmp_ext.cookie_out_never = 0; /* false */
		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
	} else if (!tp->rx_opt.cookie_in_always) {
		/* redundant indications, but ensure initialization. */
		tmp_ext.cookie_out_never = 1; /* true */
		tmp_ext.cookie_plus = 0;
	} else {
		goto drop_and_release;
	}
	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;

	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
	tcp_openreq_init(req, &tmp_opt, skb);

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->no_srccheck = inet_sk(sk)->transparent;
	ireq->opt = tcp_v4_save_options(skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	if (!want_cookie || tmp_opt.tstamp_ok)
		TCP_ECN_create_request(req, skb);

	if (want_cookie) {
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
		req->cookie_ts = tmp_opt.tstamp_ok;
	} else if (!isn) {
		/* VJ's idea. We save last timestamp seen
		 * from the destination in peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting new connection request.
		 *
		 * If "isn" is not zero, this request hit alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    tcp_death_row.sysctl_tw_recycle &&
		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
		    fl4.daddr == saddr) {
			if (!tcp_peer_is_proven(req, dst, true)) {
				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
				goto drop_and_release;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 !tcp_peer_is_proven(req, dst, false)) {
			/* Without syncookies the last quarter of the
			 * backlog is filled with destinations
			 * proven to be alive.
			 * It means that we continue to communicate
			 * with destinations already remembered
			 * at the moment of synflood.
			 */
			LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
				       &saddr, ntohs(tcp_hdr(skb)->source));
			goto drop_and_release;
		}

		isn = tcp_v4_init_sequence(skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	if (dst == NULL) {
		dst = inet_csk_route_req(sk, &fl4, req);
		if (dst == NULL)
			goto drop_and_free;
	}
	do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);

	/* We don't call tcp_v4_send_synack() directly because we need
	 * to make sure a child socket can be created successfully before
	 * sending back synack!
	 *
	 * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
	 * (or better yet, call tcp_send_synack() in the child context
	 * directly, but will have to fix bunch of other code first)
	 * after syn_recv_sock() except one will need to first fix the
	 * latter to remove its dependency on the current implementation
	 * of tcp_v4_send_synack()->tcp_select_initial_window().
	 */
	skb_synack = tcp_make_synack(sk, dst, req,
	    (struct request_values *)&tmp_ext,
	    fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);

	if (skb_synack) {
		__tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
		skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
	} else
		goto drop_and_free;

	if (likely(!do_fastopen)) {
		int err;
		err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
					    ireq->rmt_addr, ireq->opt);
		err = net_xmit_eval(err);
		if (err || want_cookie)
			goto drop_and_free;

		tcp_rsk(req)->snt_synack = tcp_time_stamp;
		tcp_rsk(req)->listener = NULL;
		/* Add the request_sock to the SYN table */
		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
		if (fastopen_cookie_present(&foc) && foc.len != 0)
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
	} else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req,
	    (struct request_values *)&tmp_ext))
		goto drop_and_free;

	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	reqsk_free(req);
drop:
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
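
/* Client-side counterpart (illustrative only): a TFO client hands its first
 * data to sendto(2) with MSG_FASTOPEN instead of calling connect(2), e.g.
 *
 *	sendto(fd, buf, len, MSG_FASTOPEN,
 *	       (struct sockaddr *)&sin, sizeof(sin));
 *
 * so the SYN that this function processes can already carry a payload and,
 * on subsequent connections, a cookie.
 */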
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->inet_daddr   = ireq->rmt_addr;
	newinet->inet_rcv_saddr = ireq->loc_addr;
	newinet->inet_saddr   = ireq->loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_mtup_init(newsk);
	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);
	tcp_synack_rtt_meas(newsk, req);
	newtp->total_retrans = req->retrans;

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	__inet_hash_nolisten(newsk, NULL);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return NULL;
put_and_exit:
	tcp_clear_xmit_timers(newsk);
	tcp_cleanup_congestion_control(newsk);
	bh_unlock_sock(newsk);
	sock_put(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev, false);

	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->syn)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}
static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		if (!tcp_v4_check(skb->len, iph->saddr,
				  iph->daddr, skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			return 0;
		}
	}

	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
				       skb->len, IPPROTO_TCP, 0);

	if (skb->len <= 76) {
		return __skb_checksum_complete(skb);
	}
	return 0;
}
/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * We really want to reject the packet as early as possible
	 * if:
	 *  o We're expecting an MD5'd packet and this is no MD5 tcp option
	 *  o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard;
#endif

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    dst->ops->check(dst, 0) == NULL) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
			rsk = sk;
			goto reset;
		}
		return 0;
	}

	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			sock_rps_save_rxhash(nsk, skb);
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
void tcp_v4_early_demux(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return;

	if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
		return;

	iph = ip_hdr(skb);
	th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));

	if (th->doff < sizeof(struct tcphdr) / 4)
		return;

	sk = __inet_lookup_established(net, &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif);
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk->sk_state != TCP_TIME_WAIT) {
			struct dst_entry *dst = sk->sk_rx_dst;

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
}
int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;
	int ret;
	struct net *net = dev_net(skb->dev);

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
		goto bad_packet;

	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;
	nf_reset(skb);

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
		struct tcp_sock *tp = tcp_sk(sk);
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
			tp->ucopy.dma_chan = net_dma_find_channel();
		if (tp->ucopy.dma_chan)
			ret = tcp_v4_do_rcv(sk, skb);
		else
#endif
		{
			if (!tcp_prequeue(sk, skb))
				ret = tcp_v4_do_rcv(sk, skb);
		}
	} else if (unlikely(sk_add_backlog(sk, skb,
					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
		bh_unlock_sock(sk);
		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	dst_hold(dst);
	sk->sk_rx_dst = dst;
	inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
};
EXPORT_SYMBOL(ipv4_specific);
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif
/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(tp->md5sig_info, rcu);
		tp->md5sig_info = NULL;
	}
#endif

#ifdef CONFIG_NET_DMA
	/* Cleans up our sk_async_wait_queue */
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/* TCP Cookie Transactions */
	if (tp->cookie_values != NULL) {
		kref_put(&tp->cookie_values->kref,
			 tcp_cookie_values_release);
		tp->cookie_values = NULL;
	}
	BUG_ON(tp->fastopen_rsk != NULL);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);

	sk_sockets_allocated_dec(sk);
	sock_release_memcg(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */
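
/* The seq_file iterators below back /proc/net/tcp (the IPv6 side provides
 * /proc/net/tcp6); e.g. (illustrative only):
 *
 *	cat /proc/net/tcp
 *
 * walks the listening hash first, then the established/TIME-WAIT hash,
 * producing one line per socket.
 */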
static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
{
	return hlist_nulls_empty(head) ? NULL :
		list_entry(head->first, struct inet_timewait_sock, tw_node);
}

static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
{
	return !is_a_nulls(tw->tw_node.next) ?
		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}
/*
 * Get next listener socket following cur.  If cur is NULL, get the first
 * socket starting from the bucket given in st->bucket; when st->bucket is
 * zero the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;
	struct inet_listen_hashbucket *ilb;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (!sk) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_nulls_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_nulls_next(sk);
	}
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	spin_unlock_bh(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
2322 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2324 struct tcp_iter_state *st = seq->private;
2329 rc = listening_get_next(seq, NULL);
2331 while (rc && *pos) {
2332 rc = listening_get_next(seq, rc);
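
/*
 * Aside (illustrative, not part of the original file): the walk above
 * yields every listening socket in bucket order, and for a listener with
 * pending open requests it yields those requests (OPENREQ state) before
 * moving on. The same visiting order as a self-contained toy, with
 * hypothetical types:
 */
#if 0
struct toy_req { struct toy_req *dl_next; };
struct toy_sk  { struct toy_sk *next; struct toy_req *syn_queue; };

static void toy_walk(struct toy_sk **buckets, int nbuckets,
		     void (*emit_sk)(struct toy_sk *),
		     void (*emit_req)(struct toy_req *))
{
	int b;

	for (b = 0; b < nbuckets; b++) {
		struct toy_sk *sk;

		for (sk = buckets[b]; sk; sk = sk->next) {
			struct toy_req *r;

			emit_sk(sk);			/* LISTENING entry */
			for (r = sk->syn_queue; r; r = r->dl_next)
				emit_req(r);		/* OPENREQ entries */
		}
	}
}
#endif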
static inline bool empty_bucket(struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
}
/*
 * Get the first established socket, starting from the bucket given in
 * st->bucket. If st->bucket is zero, the very first socket in the hash
 * is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		struct inet_timewait_sock *tw;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		inet_twsk_for_each(tw, node,
				   &tcp_hashinfo.ehash[st->bucket].twchain) {
			if (tw->tw_family != st->family ||
			    !net_eq(twsk_net(tw), net)) {
				continue;
			}
			rc = tw;
			goto out;
		}
		spin_unlock_bh(lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}
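
/*
 * Aside (illustrative, not part of the original file): the ehash chains
 * are hlist_nulls lists whose terminator encodes the bucket number. A
 * lockless RCU lookup (as in __inet_lookup_established()) restarts when
 * it ends on the wrong nulls value, because SLAB_DESTROY_BY_RCU can
 * recycle a socket onto another chain mid-walk. Read-side sketch;
 * match() is a hypothetical predicate:
 */
#if 0
static struct sock *demo_lookup(unsigned int bucket)
{
	struct sock *sk;
	const struct hlist_nulls_node *node;
begin:
	sk_nulls_for_each_rcu(sk, node, &tcp_hashinfo.ehash[bucket].chain) {
		if (match(sk))			/* hypothetical predicate */
			return sk;
	}
	/* Ended on another chain's terminator: we were moved, retry. */
	if (get_nulls_value(node) != bucket)
		goto begin;
	return NULL;
}
#endif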
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct inet_timewait_sock *tw;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		/* Look for the next non-empty bucket */
		st->offset = 0;
		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
				empty_bucket(st))
			;
		if (st->bucket > tcp_hashinfo.ehash_mask)
			return NULL;

		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
	} else
		sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;

	return rc;
}
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
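
/*
 * Aside (illustrative, not part of the original file): the seq_file core
 * drives the callbacks above as start(), then show()/next() per record,
 * then stop(), possibly many times per open file as userspace reads in
 * chunks. That is why tcp_seq_stop() must drop whichever bucket lock
 * start()/next() left held. A self-contained toy iterator over three
 * records, using the standard seq_file API:
 */
#if 0
static void *demo_start(struct seq_file *m, loff_t *pos)
{
	return *pos < 3 ? pos : NULL;
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
	++*pos;
	return *pos < 3 ? pos : NULL;
}

static void demo_stop(struct seq_file *m, void *v)
{
	/* drop any locks taken in demo_start()/demo_next() */
}

static int demo_show(struct seq_file *m, void *v)
{
	seq_printf(m, "record %lld\n", (long long)*(loff_t *)v);
	return 0;
}

static const struct seq_operations demo_seq_ops = {
	.start = demo_start,
	.next  = demo_next,
	.stop  = demo_stop,
	.show  = demo_show,
};
#endif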
int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			  sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family		= afinfo->family;
	s->last_pos		= 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);
int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	proc_net_remove(net, afinfo->name);
}
EXPORT_SYMBOL(tcp_proc_unregister);
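
/*
 * Aside (illustrative, not part of the original file): another address
 * family registers its own afinfo the same way; the real IPv6 counterpart
 * lives in net/ipv6/tcp_ipv6.c as tcp6_seq_afinfo. A hypothetical sketch
 * mirroring the IPv4 wiring below:
 */
#if 0
static struct tcp_seq_afinfo tcpX_seq_afinfo = {	/* hypothetical */
	.name		= "tcpX",		/* appears as /proc/net/tcpX */
	.family		= AF_INET6,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show, /* a family-specific show in reality */
	},
};

static int __net_init tcpX_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcpX_seq_afinfo);
}
#endif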
static void get_openreq4(const struct sock *sk, const struct request_sock *req,
			 struct seq_file *f, int i, kuid_t uid, int *len)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
		i,
		ireq->loc_addr,
		ntohs(inet_sk(sk)->inet_sport),
		ireq->rmt_addr,
		ntohs(ireq->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->retrans,
		from_kuid_munged(seq_user_ns(f), uid),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req,
		len);
}
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	if (sk->sk_state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/*
		 * Because we don't lock the socket, we might find a
		 * transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
		i, src, srcp, dest, destp, sk->sk_state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		sk->sk_state == TCP_LISTEN ?
		    (fastopenq ? fastopenq->max_qlen : 0) :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
		len);
}
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i, int *len)
{
	__be32 dest, src;
	__u16 destp, srcp;
	long delta = tw->tw_ttd - jiffies;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw, len);
}
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	int len;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, seq, st->num, &len);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, seq, st->num, &len);
		break;
	}
	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
out:
	return 0;
}
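
/*
 * Aside (illustrative, userspace, not part of the original file):
 * consumers parse the fixed-width hex fields that get_tcp4_sock() emits.
 * A minimal standalone reader of /proc/net/tcp; note the addresses print
 * the stored __be32 as a host-order word, so on little-endian machines
 * 127.0.0.1 appears as 0100007F. Compile this outside the kernel:
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned int laddr, lport, raddr, rport, state;
	char line[512];
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return 1;
	fgets(line, sizeof(line), f);	/* skip the header row */
	while (fgets(line, sizeof(line), f))
		if (sscanf(line, "%*d: %8X:%4X %8X:%4X %2X",
			   &laddr, &lport, &raddr, &rport, &state) == 5)
			printf("%08X:%u -> %08X:%u state %02X\n",
			       laddr, lport, raddr, rport, state);
	fclose(f);
	return 0;
}
#endif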
static const struct file_operations tcp_afinfo_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = tcp_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};
static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	const struct iphdr *iph = skb_gro_network_header(skb);
	__wsum wsum;
	__sum16 sum;

	switch (skb->ip_summed) {
	case CHECKSUM_COMPLETE:
		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
				  skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			break;
		}
flush:
		NAPI_GRO_CB(skb)->flush = 1;
		return NULL;

	case CHECKSUM_NONE:
		wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
					  skb_gro_len(skb), IPPROTO_TCP, 0);
		sum = csum_fold(skb_checksum(skb,
					     skb_gro_offset(skb),
					     skb_gro_len(skb),
					     wsum));
		if (sum)
			goto flush;

		skb->ip_summed = CHECKSUM_UNNECESSARY;
		break;
	}

	return tcp_gro_receive(head, skb);
}

int tcp4_gro_complete(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
				  iph->saddr, iph->daddr, 0);
	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;

	return tcp_gro_complete(skb);
}
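
/*
 * Aside (illustrative, not part of the original file): both GRO paths
 * above rely on RFC 1071 one's-complement arithmetic; csum_fold()
 * collapses a 32-bit partial sum into the final 16-bit checksum. For a
 * valid segment the sum over the pseudo-header plus segment (including
 * the checksum field) folds to 0xffff, so the complement is zero, which
 * is what the CHECKSUM_COMPLETE test checks for. A portable sketch:
 */
#if 0
static unsigned short demo_csum_fold(unsigned long sum)
{
	/* Fold the carries back in until the sum fits in 16 bits,
	 * then take the one's complement. */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (unsigned short)~sum;
}
#endif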
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.mtu_reduced		= tcp_v4_mtu_reduced,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM
	.init_cgroup		= tcp_init_cgroup,
	.destroy_cgroup		= tcp_destroy_cgroup,
	.proto_cgroup		= tcp_proto_cgroup,
#endif
};
EXPORT_SYMBOL(tcp_prot);
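
/*
 * Aside (illustrative, not part of the original file): in-kernel users
 * reach tcp_prot through the generic socket layer rather than calling its
 * members directly. A minimal sketch using the standard
 * sock_create_kern() API; demo_make_tcp_socket() is hypothetical:
 */
#if 0
static int demo_make_tcp_socket(struct socket **res)
{
	/* Yields a kernel socket whose sk->sk_prot is tcp_prot, so
	 * connect()/sendmsg()/... dispatch to the members above. */
	return sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, res);
}
#endif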
static int __net_init tcp_sk_init(struct net *net)
{
	return 0;
}

static void __net_exit tcp_sk_exit(struct net *net)
{
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	inet_hashinfo_init(&tcp_hashinfo);
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}
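
/*
 * Aside (illustrative, not part of the original file):
 * register_pernet_subsys() runs ->init for the initial namespace and for
 * every namespace created later, and ->exit/->exit_batch at teardown;
 * that is how tcp_sk_exit_batch() above purges TIME_WAIT sockets for each
 * dying namespace. A hypothetical subsystem wired the same way:
 */
#if 0
static int __net_init demo_init_net(struct net *net)
{
	return 0;	/* allocate per-namespace state here */
}

static void __net_exit demo_exit_net(struct net *net)
{
	/* release per-namespace state here */
}

static struct pernet_operations demo_net_ops = {
	.init = demo_init_net,
	.exit = demo_exit_net,
};

static int __init demo_init(void)
{
	return register_pernet_subsys(&demo_net_ops);
}
#endif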