net/ipv4/tcp_ipv4.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              Implementation of the Transmission Control Protocol(TCP).
   8  *
   9  *              IPv4 specific functions
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  */
  18
  19 /*
  20  * Changes:
  21  *              David S. Miller :       New socket lookup architecture.
  22  *                                      This code is dedicated to John Dyson.
  23  *              David S. Miller :       Change semantics of established hash,
  24  *                                      half is devoted to TIME_WAIT sockets
  25  *                                      and the rest go in the other half.
  26  *              Andi Kleen :            Add support for syncookies and fixed
  27  *                                      some bugs: ip options weren't passed to
  28  *                                      the TCP layer, missed a check for an
  29  *                                      ACK bit.
  30  *              Andi Kleen :            Implemented fast path mtu discovery.
  31  *                                      Fixed many serious bugs in the
  32  *                                      request_sock handling and moved
  33  *                                      most of it into the af independent code.
  34  *                                      Added tail drop and some other bugfixes.
  35  *                                      Added new listen semantics.
  36  *              Mike McLagan    :       Routing by source
  37  *      Juan Jose Ciarlante:            ip_dynaddr bits
  38  *              Andi Kleen:             various fixes.
  39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  40  *                                      coma.
  41  *      Andi Kleen              :       Fix new listen.
  42  *      Andi Kleen              :       Fix accept error reporting.
  43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  45  *                                      a single port at the same time.
  46  */
  47
  48 #define pr_fmt(fmt) "TCP: " fmt
  49
  50 #include <linux/bottom_half.h>
  51 #include <linux/types.h>
  52 #include <linux/fcntl.h>
  53 #include <linux/module.h>
  54 #include <linux/random.h>
  55 #include <linux/cache.h>
  56 #include <linux/jhash.h>
  57 #include <linux/init.h>
  58 #include <linux/times.h>
  59 #include <linux/slab.h>
  60
  61 #include <net/net_namespace.h>
  62 #include <net/icmp.h>
  63 #include <net/inet_hashtables.h>
  64 #include <net/tcp.h>
  65 #include <net/transp_v6.h>
  66 #include <net/ipv6.h>
  67 #include <net/inet_common.h>
  68 #include <net/timewait_sock.h>
  69 #include <net/xfrm.h>
  70 #include <net/secure_seq.h>
  71 #include <net/busy_poll.h>
  72
  73 #include <linux/inet.h>
  74 #include <linux/ipv6.h>
  75 #include <linux/stddef.h>
  76 #include <linux/proc_fs.h>
  77 #include <linux/seq_file.h>
  78 #include <linux/inetdevice.h>
  79 #include <linux/btf_ids.h>
  80
  81 #include <crypto/hash.h>
  82 #include <linux/scatterlist.h>
  83
  84 #include <trace/events/tcp.h>
  85
  86 #ifdef CONFIG_TCP_MD5SIG
  87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  88                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
  89 #endif
  90
  91 struct inet_hashinfo tcp_hashinfo;
  92 EXPORT_SYMBOL(tcp_hashinfo);
  93
  94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
  95
  96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
  97 {
  98         return secure_tcp_seq(ip_hdr(skb)->daddr,
  99                               ip_hdr(skb)->saddr,
 100                               tcp_hdr(skb)->dest,
 101                               tcp_hdr(skb)->source);
 102 }
 103
 104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 105 {
 106         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
 107 }
 108
 109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 110 {
 111         int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
 112         const struct inet_timewait_sock *tw = inet_twsk(sktw);
 113         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 114         struct tcp_sock *tp = tcp_sk(sk);
 115
 116         if (reuse == 2) {
 117                 /* Still does not detect *everything* that goes through
 118                  * lo, since we require a loopback src or dst address
 119                  * or direct binding to 'lo' interface.
 120                  */
 121                 bool loopback = false;
 122                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
 123                         loopback = true;
 124 #if IS_ENABLED(CONFIG_IPV6)
 125                 if (tw->tw_family == AF_INET6) {
 126                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
 127                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
 128                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
 129                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
 130                                 loopback = true;
 131                 } else
 132 #endif
 133                 {
 134                         if (ipv4_is_loopback(tw->tw_daddr) ||
 135                             ipv4_is_loopback(tw->tw_rcv_saddr))
 136                                 loopback = true;
 137                 }
 138                 if (!loopback)
 139                         reuse = 0;
 140         }
 141
 142         /* With PAWS, it is safe from the viewpoint
 143            of data integrity. Even without PAWS it is safe provided sequence
 144            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 145
 146            Actually, the idea is close to VJ's one, only timestamp cache is
 147            held not per host, but per port pair and TW bucket is used as state
 148            holder.
 149
 150            If TW bucket has been already destroyed we fall back to VJ's scheme
 151            and use initial timestamp retrieved from peer table.
 152          */
 153         if (tcptw->tw_ts_recent_stamp &&
 154             (!twp || (reuse && time_after32(ktime_get_seconds(),
 155                                             tcptw->tw_ts_recent_stamp)))) {
 156                 /* In case of repair and re-using TIME-WAIT sockets we still
 157                  * want to be sure that it is safe as above but honor the
 158                  * sequence numbers and time stamps set as part of the repair
 159                  * process.
 160                  *
 161                  * Without this check re-using a TIME-WAIT socket with TCP
 162                  * repair would accumulate a -1 on the repair assigned
 163                  * sequence number. The first time it is reused the sequence
 164                  * is -1, the second time -2, etc. This fixes that issue
 165                  * without appearing to create any others.
 166                  */
 167                 if (likely(!tp->repair)) {
 168                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
 169
 170                         if (!seq)
 171                                 seq = 1;
 172                         WRITE_ONCE(tp->write_seq, seq);
 173                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 174                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 175                 }
 176                 sock_hold(sktw);
 177                 return 1;
 178         }
 179
 180         return 0;
 181 }
 182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 183
 184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 185                               int addr_len)
 186 {
 187         /* This check is replicated from tcp_v4_connect() and intended to
 188          * prevent BPF program called below from accessing bytes that are out
 189          * of the bound specified by user in addr_len.
 190          */
 191         if (addr_len < sizeof(struct sockaddr_in))
 192                 return -EINVAL;
 193
 194         sock_owned_by_me(sk);
 195
 196         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
 197 }
 198
 199 /* This will initiate an outgoing connection. */
 200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 201 {
 202         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 203         struct inet_timewait_death_row *tcp_death_row;
 204         struct inet_sock *inet = inet_sk(sk);
 205         struct tcp_sock *tp = tcp_sk(sk);
 206         struct ip_options_rcu *inet_opt;
 207         struct net *net = sock_net(sk);
 208         __be16 orig_sport, orig_dport;
 209         __be32 daddr, nexthop;
 210         struct flowi4 *fl4;
 211         struct rtable *rt;
 212         int err;
 213
 214         if (addr_len < sizeof(struct sockaddr_in))
 215                 return -EINVAL;
 216
 217         if (usin->sin_family != AF_INET)
 218                 return -EAFNOSUPPORT;
 219
 220         nexthop = daddr = usin->sin_addr.s_addr;
 221         inet_opt = rcu_dereference_protected(inet->inet_opt,
 222                                              lockdep_sock_is_held(sk));
 223         if (inet_opt && inet_opt->opt.srr) {
 224                 if (!daddr)
 225                         return -EINVAL;
 226                 nexthop = inet_opt->opt.faddr;
 227         }
 228
 229         orig_sport = inet->inet_sport;
 230         orig_dport = usin->sin_port;
 231         fl4 = &inet->cork.fl.u.ip4;
 232         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 233                               sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
 234                               orig_dport, sk);
 235         if (IS_ERR(rt)) {
 236                 err = PTR_ERR(rt);
 237                 if (err == -ENETUNREACH)
 238                         IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
 239                 return err;
 240         }
 241
 242         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 243                 ip_rt_put(rt);
 244                 return -ENETUNREACH;
 245         }
 246
 247         if (!inet_opt || !inet_opt->opt.srr)
 248                 daddr = fl4->daddr;
 249
 250         tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 251
 252         if (!inet->inet_saddr) {
 253                 err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
 254                 if (err) {
 255                         ip_rt_put(rt);
 256                         return err;
 257                 }
 258         } else {
 259                 sk_rcv_saddr_set(sk, inet->inet_saddr);
 260         }
 261
 262         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 263                 /* Reset inherited state */
 264                 tp->rx_opt.ts_recent       = 0;
 265                 tp->rx_opt.ts_recent_stamp = 0;
 266                 if (likely(!tp->repair))
 267                         WRITE_ONCE(tp->write_seq, 0);
 268         }
 269
 270         inet->inet_dport = usin->sin_port;
 271         sk_daddr_set(sk, daddr);
 272
 273         inet_csk(sk)->icsk_ext_hdr_len = 0;
 274         if (inet_opt)
 275                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 276
 277         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 278
 279         /* Socket identity is still unknown (sport may be zero).
 280          * However we set state to SYN-SENT and not releasing socket
 281          * lock select source port, enter ourselves into the hash tables and
 282          * complete initialization after this.
 283          */
 284         tcp_set_state(sk, TCP_SYN_SENT);
 285         err = inet_hash_connect(tcp_death_row, sk);
 286         if (err)
 287                 goto failure;
 288
 289         sk_set_txhash(sk);
 290
 291         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 292                                inet->inet_sport, inet->inet_dport, sk);
 293         if (IS_ERR(rt)) {
 294                 err = PTR_ERR(rt);
 295                 rt = NULL;
 296                 goto failure;
 297         }
 298         /* OK, now commit destination to socket.  */
 299         sk->sk_gso_type = SKB_GSO_TCPV4;
 300         sk_setup_caps(sk, &rt->dst);
 301         rt = NULL;
 302
 303         if (likely(!tp->repair)) {
 304                 if (!tp->write_seq)
 305                         WRITE_ONCE(tp->write_seq,
 306                                    secure_tcp_seq(inet->inet_saddr,
 307                                                   inet->inet_daddr,
 308                                                   inet->inet_sport,
 309                                                   usin->sin_port));
 310                 tp->tsoffset = secure_tcp_ts_off(net, inet->inet_saddr,
 311                                                  inet->inet_daddr);
 312         }
 313
 314         inet->inet_id = get_random_u16();
 315
 316         if (tcp_fastopen_defer_connect(sk, &err))
 317                 return err;
 318         if (err)
 319                 goto failure;
 320
 321         err = tcp_connect(sk);
 322
 323         if (err)
 324                 goto failure;
 325
 326         return 0;
 327
 328 failure:
 329         /*
 330          * This unhashes the socket and releases the local port,
 331          * if necessary.
 332          */
 333         tcp_set_state(sk, TCP_CLOSE);
 334         inet_bhash2_reset_saddr(sk);
 335         ip_rt_put(rt);
 336         sk->sk_route_caps = 0;
 337         inet->inet_dport = 0;
 338         return err;
 339 }
 340 EXPORT_SYMBOL(tcp_v4_connect);
 341
 342 /*
 343  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 344  * It can be called through tcp_release_cb() if socket was owned by user
 345  * at the time tcp_v4_err() was called to handle ICMP message.
 346  */
 347 void tcp_v4_mtu_reduced(struct sock *sk)
 348 {
 349         struct inet_sock *inet = inet_sk(sk);
 350         struct dst_entry *dst;
 351         u32 mtu;
 352
 353         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 354                 return;
 355         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
 356         dst = inet_csk_update_pmtu(sk, mtu);
 357         if (!dst)
 358                 return;
 359
 360         /* Something is about to be wrong... Remember soft error
 361          * for the case, if this connection will not able to recover.
 362          */
 363         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 364                 sk->sk_err_soft = EMSGSIZE;
 365
 366         mtu = dst_mtu(dst);
 367
 368         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 369             ip_sk_accept_pmtu(sk) &&
 370             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 371                 tcp_sync_mss(sk, mtu);
 372
 373                 /* Resend the TCP packet because it's
 374                  * clear that the old packet has been
 375                  * dropped. This is the new "fast" path mtu
 376                  * discovery.
 377                  */
 378                 tcp_simple_retransmit(sk);
 379         } /* else let the usual retransmit timer handle it */
 380 }
 381 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 382
 383 static void do_redirect(struct sk_buff *skb, struct sock *sk)
 384 {
 385         struct dst_entry *dst = __sk_dst_check(sk, 0);
 386
 387         if (dst)
 388                 dst->ops->redirect(dst, sk, skb);
 389 }
 390
 391
 392 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 393 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 394 {
 395         struct request_sock *req = inet_reqsk(sk);
 396         struct net *net = sock_net(sk);
 397
 398         /* ICMPs are not backlogged, hence we cannot get
 399          * an established socket here.
 400          */
 401         if (seq != tcp_rsk(req)->snt_isn) {
 402                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 403         } else if (abort) {
 404                 /*
 405                  * Still in SYN_RECV, just remove it silently.
 406                  * There is no good way to pass the error to the newly
 407                  * created socket, and POSIX does not want network
 408                  * errors returned from accept().
 409                  */
 410                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 411                 tcp_listendrop(req->rsk_listener);
 412         }
 413         reqsk_put(req);
 414 }
 415 EXPORT_SYMBOL(tcp_req_err);
 416
 417 /* TCP-LD (RFC 6069) logic */
 418 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
 419 {
 420         struct inet_connection_sock *icsk = inet_csk(sk);
 421         struct tcp_sock *tp = tcp_sk(sk);
 422         struct sk_buff *skb;
 423         s32 remaining;
 424         u32 delta_us;
 425
 426         if (sock_owned_by_user(sk))
 427                 return;
 428
 429         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 430             !icsk->icsk_backoff)
 431                 return;
 432
 433         skb = tcp_rtx_queue_head(sk);
 434         if (WARN_ON_ONCE(!skb))
 435                 return;
 436
 437         icsk->icsk_backoff--;
 438         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
 439         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 440
 441         tcp_mstamp_refresh(tp);
 442         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
 443         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
 444
 445         if (remaining > 0) {
 446                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 447                                           remaining, TCP_RTO_MAX);
 448         } else {
 449                 /* RTO revert clocked out retransmission.
 450                  * Will retransmit now.
 451                  */
 452                 tcp_retransmit_timer(sk);
 453         }
 454 }
 455 EXPORT_SYMBOL(tcp_ld_RTO_revert);
 456
 457 /*
 458  * This routine is called by the ICMP module when it gets some
 459  * sort of error condition.  If err < 0 then the socket should
 460  * be closed and the error returned to the user.  If err > 0
 461  * it's just the icmp type << 8 | icmp code.  After adjustment
 462  * header points to the first 8 bytes of the tcp header.  We need
 463  * to find the appropriate port.
 464  *
 465  * The locking strategy used here is very "optimistic". When
 466  * someone else accesses the socket the ICMP is just dropped
 467  * and for some paths there is no check at all.
 468  * A more general error queue to queue errors for later handling
 469  * is probably better.
 470  *
 471  */
 472
 473 int tcp_v4_err(struct sk_buff *skb, u32 info)
 474 {
 475         const struct iphdr *iph = (const struct iphdr *)skb->data;
 476         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 477         struct tcp_sock *tp;
 478         struct inet_sock *inet;
 479         const int type = icmp_hdr(skb)->type;
 480         const int code = icmp_hdr(skb)->code;
 481         struct sock *sk;
 482         struct request_sock *fastopen;
 483         u32 seq, snd_una;
 484         int err;
 485         struct net *net = dev_net(skb->dev);
 486
 487         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
 488                                        iph->daddr, th->dest, iph->saddr,
 489                                        ntohs(th->source), inet_iif(skb), 0);
 490         if (!sk) {
 491                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 492                 return -ENOENT;
 493         }
 494         if (sk->sk_state == TCP_TIME_WAIT) {
 495                 inet_twsk_put(inet_twsk(sk));
 496                 return 0;
 497         }
 498         seq = ntohl(th->seq);
 499         if (sk->sk_state == TCP_NEW_SYN_RECV) {
 500                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
 501                                      type == ICMP_TIME_EXCEEDED ||
 502                                      (type == ICMP_DEST_UNREACH &&
 503                                       (code == ICMP_NET_UNREACH ||
 504                                        code == ICMP_HOST_UNREACH)));
 505                 return 0;
 506         }
 507
 508         bh_lock_sock(sk);
 509         /* If too many ICMPs get dropped on busy
 510          * servers this needs to be solved differently.
 511          * We do take care of PMTU discovery (RFC1191) special case :
 512          * we can receive locally generated ICMP messages while socket is held.
 513          */
 514         if (sock_owned_by_user(sk)) {
 515                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 516                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 517         }
 518         if (sk->sk_state == TCP_CLOSE)
 519                 goto out;
 520
 521         if (static_branch_unlikely(&ip4_min_ttl)) {
 522                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
 523                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
 524                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 525                         goto out;
 526                 }
 527         }
 528
 529         tp = tcp_sk(sk);
 530         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 531         fastopen = rcu_dereference(tp->fastopen_rsk);
 532         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 533         if (sk->sk_state != TCP_LISTEN &&
 534             !between(seq, snd_una, tp->snd_nxt)) {
 535                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 536                 goto out;
 537         }
 538
 539         switch (type) {
 540         case ICMP_REDIRECT:
 541                 if (!sock_owned_by_user(sk))
 542                         do_redirect(skb, sk);
 543                 goto out;
 544         case ICMP_SOURCE_QUENCH:
 545                 /* Just silently ignore these. */
 546                 goto out;
 547         case ICMP_PARAMETERPROB:
 548                 err = EPROTO;
 549                 break;
 550         case ICMP_DEST_UNREACH:
 551                 if (code > NR_ICMP_UNREACH)
 552                         goto out;
 553
 554                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 555                         /* We are not interested in TCP_LISTEN and open_requests
 556                          * (SYN-ACKs send out by Linux are always <576bytes so
 557                          * they should go through unfragmented).
 558                          */
 559                         if (sk->sk_state == TCP_LISTEN)
 560                                 goto out;
 561
 562                         WRITE_ONCE(tp->mtu_info, info);
 563                         if (!sock_owned_by_user(sk)) {
 564                                 tcp_v4_mtu_reduced(sk);
 565                         } else {
 566                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 567                                         sock_hold(sk);
 568                         }
 569                         goto out;
 570                 }
 571
 572                 err = icmp_err_convert[code].errno;
 573                 /* check if this ICMP message allows revert of backoff.
 574                  * (see RFC 6069)
 575                  */
 576                 if (!fastopen &&
 577                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
 578                         tcp_ld_RTO_revert(sk, seq);
 579                 break;
 580         case ICMP_TIME_EXCEEDED:
 581                 err = EHOSTUNREACH;
 582                 break;
 583         default:
 584                 goto out;
 585         }
 586
 587         switch (sk->sk_state) {
 588         case TCP_SYN_SENT:
 589         case TCP_SYN_RECV:
 590                 /* Only in fast or simultaneous open. If a fast open socket is
 591                  * already accepted it is treated as a connected one below.
 592                  */
 593                 if (fastopen && !fastopen->sk)
 594                         break;
 595
 596                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
 597
 598                 if (!sock_owned_by_user(sk)) {
 599                         sk->sk_err = err;
 600
 601                         sk_error_report(sk);
 602
 603                         tcp_done(sk);
 604                 } else {
 605                         sk->sk_err_soft = err;
 606                 }
 607                 goto out;
 608         }
 609
 610         /* If we've already connected we will keep trying
 611          * until we time out, or the user gives up.
 612          *
 613          * rfc1122 4.2.3.9 allows to consider as hard errors
 614          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 615          * but it is obsoleted by pmtu discovery).
 616          *
 617          * Note, that in modern internet, where routing is unreliable
 618          * and in each dark corner broken firewalls sit, sending random
 619          * errors ordered by their masters even this two messages finally lose
 620          * their original sense (even Linux sends invalid PORT_UNREACHs)
 621          *
 622          * Now we are in compliance with RFCs.
 623          *                                                      --ANK (980905)
 624          */
 625
 626         inet = inet_sk(sk);
 627         if (!sock_owned_by_user(sk) && inet->recverr) {
 628                 sk->sk_err = err;
 629                 sk_error_report(sk);
 630         } else  { /* Only an error on timeout */
 631                 sk->sk_err_soft = err;
 632         }
 633
 634 out:
 635         bh_unlock_sock(sk);
 636         sock_put(sk);
 637         return 0;
 638 }
 639
 640 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 641 {
 642         struct tcphdr *th = tcp_hdr(skb);
 643
 644         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 645         skb->csum_start = skb_transport_header(skb) - skb->head;
 646         skb->csum_offset = offsetof(struct tcphdr, check);
 647 }
 648
 649 /* This routine computes an IPv4 TCP checksum. */
 650 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 651 {
 652         const struct inet_sock *inet = inet_sk(sk);
 653
 654         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 655 }
 656 EXPORT_SYMBOL(tcp_v4_send_check);
 657
 658 /*
 659  *      This routine will send an RST to the other tcp.
 660  *
 661  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 662  *                    for reset.
 663  *      Answer: if a packet caused RST, it is not for a socket
 664  *              existing in our system, if it is matched to a socket,
 665  *              it is just duplicate segment or bug in other side's TCP.
 666  *              So that we build reply only basing on parameters
 667  *              arrived with segment.
 668  *      Exception: precedence violation. We do not implement it in any case.
 669  */
 670
 671 #ifdef CONFIG_TCP_MD5SIG
 672 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
 673 #else
 674 #define OPTION_BYTES sizeof(__be32)
 675 #endif
 676
 677 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 678 {
 679         const struct tcphdr *th = tcp_hdr(skb);
 680         struct {
 681                 struct tcphdr th;
 682                 __be32 opt[OPTION_BYTES / sizeof(__be32)];
 683         } rep;
 684         struct ip_reply_arg arg;
 685 #ifdef CONFIG_TCP_MD5SIG
 686         struct tcp_md5sig_key *key = NULL;
 687         const __u8 *hash_location = NULL;
 688         unsigned char newhash[16];
 689         int genhash;
 690         struct sock *sk1 = NULL;
 691 #endif
 692         u64 transmit_time = 0;
 693         struct sock *ctl_sk;
 694         struct net *net;
 695
 696         /* Never send a reset in response to a reset. */
 697         if (th->rst)
 698                 return;
 699
 700         /* If sk not NULL, it means we did a successful lookup and incoming
 701          * route had to be correct. prequeue might have dropped our dst.
 702          */
 703         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 704                 return;
 705
 706         /* Swap the send and the receive. */
 707         memset(&rep, 0, sizeof(rep));
 708         rep.th.dest   = th->source;
 709         rep.th.source = th->dest;
 710         rep.th.doff   = sizeof(struct tcphdr) / 4;
 711         rep.th.rst    = 1;
 712
 713         if (th->ack) {
 714                 rep.th.seq = th->ack_seq;
 715         } else {
 716                 rep.th.ack = 1;
 717                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 718                                        skb->len - (th->doff << 2));
 719         }
 720
 721         memset(&arg, 0, sizeof(arg));
 722         arg.iov[0].iov_base = (unsigned char *)&rep;
 723         arg.iov[0].iov_len  = sizeof(rep.th);
 724
 725         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 726 #ifdef CONFIG_TCP_MD5SIG
 727         rcu_read_lock();
 728         hash_location = tcp_parse_md5sig_option(th);
 729         if (sk && sk_fullsock(sk)) {
 730                 const union tcp_md5_addr *addr;
 731                 int l3index;
 732
 733                 /* sdif set, means packet ingressed via a device
 734                  * in an L3 domain and inet_iif is set to it.
 735                  */
 736                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
 737                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 738                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
 739         } else if (hash_location) {
 740                 const union tcp_md5_addr *addr;
 741                 int sdif = tcp_v4_sdif(skb);
 742                 int dif = inet_iif(skb);
 743                 int l3index;
 744
 745                 /*
 746                  * active side is lost. Try to find listening socket through
 747                  * source port, and then find md5 key through listening socket.
 748                  * we are not loose security here:
 749                  * Incoming packet is checked with md5 hash with finding key,
 750                  * no RST generated if md5 hash doesn't match.
 751                  */
 752                 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
 753                                              NULL, 0, ip_hdr(skb)->saddr,
 754                                              th->source, ip_hdr(skb)->daddr,
 755                                              ntohs(th->source), dif, sdif);
 756                 /* don't send rst if it can't find key */
 757                 if (!sk1)
 758                         goto out;
 759
 760                 /* sdif set, means packet ingressed via a device
 761                  * in an L3 domain and dif is set to it.
 762                  */
 763                 l3index = sdif ? dif : 0;
 764                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 765                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
 766                 if (!key)
 767                         goto out;
 768
 769
 770                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 771                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
 772                         goto out;
 773
 774         }
 775
 776         if (key) {
 777                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 778                                    (TCPOPT_NOP << 16) |
 779                                    (TCPOPT_MD5SIG << 8) |
 780                                    TCPOLEN_MD5SIG);
 781                 /* Update length and the length the header thinks exists */
 782                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 783                 rep.th.doff = arg.iov[0].iov_len / 4;
 784
 785                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 786                                      key, ip_hdr(skb)->saddr,
 787                                      ip_hdr(skb)->daddr, &rep.th);
 788         }
 789 #endif
 790         /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
 791         if (rep.opt[0] == 0) {
 792                 __be32 mrst = mptcp_reset_option(skb);
 793
 794                 if (mrst) {
 795                         rep.opt[0] = mrst;
 796                         arg.iov[0].iov_len += sizeof(mrst);
 797                         rep.th.doff = arg.iov[0].iov_len / 4;
 798                 }
 799         }
 800
 801         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 802                                       ip_hdr(skb)->saddr, /* XXX */
 803                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 804         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 805         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 806
 807         /* When socket is gone, all binding information is lost.
 808          * routing might fail in this case. No choice here, if we choose to force
 809          * input interface, we will misroute in case of asymmetric route.
 810          */
 811         if (sk) {
 812                 arg.bound_dev_if = sk->sk_bound_dev_if;
 813                 if (sk_fullsock(sk))
 814                         trace_tcp_send_reset(sk, skb);
 815         }
 816
 817         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 818                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 819
 820         arg.tos = ip_hdr(skb)->tos;
 821         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 822         local_bh_disable();
 823         ctl_sk = this_cpu_read(ipv4_tcp_sk);
 824         sock_net_set(ctl_sk, net);
 825         if (sk) {
 826                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 827                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
 828                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 829                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
 830                 transmit_time = tcp_transmit_time(sk);
 831                 xfrm_sk_clone_policy(ctl_sk, sk);
 832         } else {
 833                 ctl_sk->sk_mark = 0;
 834                 ctl_sk->sk_priority = 0;
 835         }
 836         ip_send_unicast_reply(ctl_sk,
 837                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 838                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 839                               &arg, arg.iov[0].iov_len,
 840                               transmit_time);
 841
 842         xfrm_sk_free_policy(ctl_sk);
 843         sock_net_set(ctl_sk, &init_net);
 844         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 845         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 846         local_bh_enable();
 847
 848 #ifdef CONFIG_TCP_MD5SIG
 849 out:
 850         rcu_read_unlock();
 851 #endif
 852 }
 853
 854 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 855    outside socket context is ugly, certainly. What can I do?
 856  */
 857
 858 static void tcp_v4_send_ack(const struct sock *sk,
 859                             struct sk_buff *skb, u32 seq, u32 ack,
 860                             u32 win, u32 tsval, u32 tsecr, int oif,
 861                             struct tcp_md5sig_key *key,
 862                             int reply_flags, u8 tos)
 863 {
 864         const struct tcphdr *th = tcp_hdr(skb);
 865         struct {
 866                 struct tcphdr th;
 867                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 868 #ifdef CONFIG_TCP_MD5SIG
 869                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 870 #endif
 871                         ];
 872         } rep;
 873         struct net *net = sock_net(sk);
 874         struct ip_reply_arg arg;
 875         struct sock *ctl_sk;
 876         u64 transmit_time;
 877
 878         memset(&rep.th, 0, sizeof(struct tcphdr));
 879         memset(&arg, 0, sizeof(arg));
 880
 881         arg.iov[0].iov_base = (unsigned char *)&rep;
 882         arg.iov[0].iov_len  = sizeof(rep.th);
 883         if (tsecr) {
 884                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 885                                    (TCPOPT_TIMESTAMP << 8) |
 886                                    TCPOLEN_TIMESTAMP);
 887                 rep.opt[1] = htonl(tsval);
 888                 rep.opt[2] = htonl(tsecr);
 889                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 890         }
 891
 892         /* Swap the send and the receive. */
 893         rep.th.dest    = th->source;
 894         rep.th.source  = th->dest;
 895         rep.th.doff    = arg.iov[0].iov_len / 4;
 896         rep.th.seq     = htonl(seq);
 897         rep.th.ack_seq = htonl(ack);
 898         rep.th.ack     = 1;
 899         rep.th.window  = htons(win);
 900
 901 #ifdef CONFIG_TCP_MD5SIG
 902         if (key) {
 903                 int offset = (tsecr) ? 3 : 0;
 904
 905                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 906                                           (TCPOPT_NOP << 16) |
 907                                           (TCPOPT_MD5SIG << 8) |
 908                                           TCPOLEN_MD5SIG);
 909                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 910                 rep.th.doff = arg.iov[0].iov_len/4;
 911
 912                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 913                                     key, ip_hdr(skb)->saddr,
 914                                     ip_hdr(skb)->daddr, &rep.th);
 915         }
 916 #endif
 917         arg.flags = reply_flags;
 918         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 919                                       ip_hdr(skb)->saddr, /* XXX */
 920                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 921         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 922         if (oif)
 923                 arg.bound_dev_if = oif;
 924         arg.tos = tos;
 925         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 926         local_bh_disable();
 927         ctl_sk = this_cpu_read(ipv4_tcp_sk);
 928         sock_net_set(ctl_sk, net);
 929         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 930                            inet_twsk(sk)->tw_mark : sk->sk_mark;
 931         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 932                            inet_twsk(sk)->tw_priority : sk->sk_priority;
 933         transmit_time = tcp_transmit_time(sk);
 934         ip_send_unicast_reply(ctl_sk,
 935                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 936                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 937                               &arg, arg.iov[0].iov_len,
 938                               transmit_time);
 939
 940         sock_net_set(ctl_sk, &init_net);
 941         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 942         local_bh_enable();
 943 }
 944
 945 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 946 {
 947         struct inet_timewait_sock *tw = inet_twsk(sk);
 948         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 949
 950         tcp_v4_send_ack(sk, skb,
 951                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 952                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 953                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
 954                         tcptw->tw_ts_recent,
 955                         tw->tw_bound_dev_if,
 956                         tcp_twsk_md5_key(tcptw),
 957                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 958                         tw->tw_tos
 959                         );
 960
 961         inet_twsk_put(tw);
 962 }
 963
 964 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 965                                   struct request_sock *req)
 966 {
 967         const union tcp_md5_addr *addr;
 968         int l3index;
 969
 970         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 971          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 972          */
 973         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 974                                              tcp_sk(sk)->snd_nxt;
 975
 976         /* RFC 7323 2.3
 977          * The window field (SEG.WND) of every outgoing segment, with the
 978          * exception of <SYN> segments, MUST be right-shifted by
 979          * Rcv.Wind.Shift bits:
 980          */
 981         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 982         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
 983         tcp_v4_send_ack(sk, skb, seq,
 984                         tcp_rsk(req)->rcv_nxt,
 985                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 986                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
 987                         req->ts_recent,
 988                         0,
 989                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
 990                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 991                         ip_hdr(skb)->tos);
 992 }
 993
 994 /*
 995  *      Send a SYN-ACK after having received a SYN.
 996  *      This still operates on a request_sock only, not on a big
 997  *      socket.
 998  */
 999 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1000                               struct flowi *fl,
1001                               struct request_sock *req,
1002                               struct tcp_fastopen_cookie *foc,
1003                               enum tcp_synack_type synack_type,
1004                               struct sk_buff *syn_skb)
1005 {
1006         const struct inet_request_sock *ireq = inet_rsk(req);
1007         struct flowi4 fl4;
1008         int err = -1;
1009         struct sk_buff *skb;
1010         u8 tos;
1011
1012         /* First, grab a route. */
1013         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1014                 return -1;
1015
1016         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1017
1018         if (skb) {
1019                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1020
1021                 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1022                                 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1023                                 (inet_sk(sk)->tos & INET_ECN_MASK) :
1024                                 inet_sk(sk)->tos;
1025
1026                 if (!INET_ECN_is_capable(tos) &&
1027                     tcp_bpf_ca_needs_ecn((struct sock *)req))
1028                         tos |= INET_ECN_ECT_0;
1029
1030                 rcu_read_lock();
1031                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1032                                             ireq->ir_rmt_addr,
1033                                             rcu_dereference(ireq->ireq_opt),
1034                                             tos);
1035                 rcu_read_unlock();
1036                 err = net_xmit_eval(err);
1037         }
1038
1039         return err;
1040 }
1041
1042 /*
1043  *      IPv4 request_sock destructor.
1044  */
1045 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1046 {
1047         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1048 }
1049
1050 #ifdef CONFIG_TCP_MD5SIG
1051 /*
1052  * RFC2385 MD5 checksumming requires a mapping of
1053  * IP address->MD5 Key.
1054  * We need to maintain these in the sk structure.
1055  */
1056
1057 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1058 EXPORT_SYMBOL(tcp_md5_needed);
1059
1060 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1061 {
1062         if (!old)
1063                 return true;
1064
1065         /* l3index always overrides non-l3index */
1066         if (old->l3index && new->l3index == 0)
1067                 return false;
1068         if (old->l3index == 0 && new->l3index)
1069                 return true;
1070
1071         return old->prefixlen < new->prefixlen;
1072 }
1073
1074 /* Find the Key structure for an address.  */
1075 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1076                                            const union tcp_md5_addr *addr,
1077                                            int family)
1078 {
1079         const struct tcp_sock *tp = tcp_sk(sk);
1080         struct tcp_md5sig_key *key;
1081         const struct tcp_md5sig_info *md5sig;
1082         __be32 mask;
1083         struct tcp_md5sig_key *best_match = NULL;
1084         bool match;
1085
1086         /* caller either holds rcu_read_lock() or socket lock */
1087         md5sig = rcu_dereference_check(tp->md5sig_info,
1088                                        lockdep_sock_is_held(sk));
1089         if (!md5sig)
1090                 return NULL;
1091
1092         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1093                                  lockdep_sock_is_held(sk)) {
1094                 if (key->family != family)
1095                         continue;
1096                 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1097                         continue;
1098                 if (family == AF_INET) {
1099                         mask = inet_make_mask(key->prefixlen);
1100                         match = (key->addr.a4.s_addr & mask) ==
1101                                 (addr->a4.s_addr & mask);
1102 #if IS_ENABLED(CONFIG_IPV6)
1103                 } else if (family == AF_INET6) {
1104                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1105                                                   key->prefixlen);
1106 #endif
1107                 } else {
1108                         match = false;
1109                 }
1110
1111                 if (match && better_md5_match(best_match, key))
1112                         best_match = key;
1113         }
1114         return best_match;
1115 }
1116 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1117
1118 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1119                                                       const union tcp_md5_addr *addr,
1120                                                       int family, u8 prefixlen,
1121                                                       int l3index, u8 flags)
1122 {
1123         const struct tcp_sock *tp = tcp_sk(sk);
1124         struct tcp_md5sig_key *key;
1125         unsigned int size = sizeof(struct in_addr);
1126         const struct tcp_md5sig_info *md5sig;
1127
1128         /* caller either holds rcu_read_lock() or socket lock */
1129         md5sig = rcu_dereference_check(tp->md5sig_info,
1130                                        lockdep_sock_is_held(sk));
1131         if (!md5sig)
1132                 return NULL;
1133 #if IS_ENABLED(CONFIG_IPV6)
1134         if (family == AF_INET6)
1135                 size = sizeof(struct in6_addr);
1136 #endif
1137         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1138                                  lockdep_sock_is_held(sk)) {
1139                 if (key->family != family)
1140                         continue;
1141                 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1142                         continue;
1143                 if (key->l3index != l3index)
1144                         continue;
1145                 if (!memcmp(&key->addr, addr, size) &&
1146                     key->prefixlen == prefixlen)
1147                         return key;
1148         }
1149         return NULL;
1150 }
1151
1152 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1153                                          const struct sock *addr_sk)
1154 {
1155         const union tcp_md5_addr *addr;
1156         int l3index;
1157
1158         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1159                                                  addr_sk->sk_bound_dev_if);
1160         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1161         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1162 }
1163 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1164
1165 /* This can be called on a newly created socket, from other files */
1166 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1167                    int family, u8 prefixlen, int l3index, u8 flags,
1168                    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1169 {
1170         /* Add Key to the list */
1171         struct tcp_md5sig_key *key;
1172         struct tcp_sock *tp = tcp_sk(sk);
1173         struct tcp_md5sig_info *md5sig;
1174
1175         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1176         if (key) {
1177                 /* Pre-existing entry - just update that one.
1178                  * Note that the key might be used concurrently.
1179                  * data_race() is telling kcsan that we do not care of
1180                  * key mismatches, since changing MD5 key on live flows
1181                  * can lead to packet drops.
1182                  */
1183                 data_race(memcpy(key->key, newkey, newkeylen));
1184
1185                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1186                  * Also note that a reader could catch new key->keylen value
1187                  * but old key->key[], this is the reason we use __GFP_ZERO
1188                  * at sock_kmalloc() time below these lines.
1189                  */
1190                 WRITE_ONCE(key->keylen, newkeylen);
1191
1192                 return 0;
1193         }
1194
1195         md5sig = rcu_dereference_protected(tp->md5sig_info,
1196                                            lockdep_sock_is_held(sk));
1197         if (!md5sig) {
1198                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1199                 if (!md5sig)
1200                         return -ENOMEM;
1201
1202                 sk_gso_disable(sk);
1203                 INIT_HLIST_HEAD(&md5sig->head);
1204                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1205         }
1206
1207         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1208         if (!key)
1209                 return -ENOMEM;
1210         if (!tcp_alloc_md5sig_pool()) {
1211                 sock_kfree_s(sk, key, sizeof(*key));
1212                 return -ENOMEM;
1213         }
1214
1215         memcpy(key->key, newkey, newkeylen);
1216         key->keylen = newkeylen;
1217         key->family = family;
1218         key->prefixlen = prefixlen;
1219         key->l3index = l3index;
1220         key->flags = flags;
1221         memcpy(&key->addr, addr,
1222                (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1223                                                                  sizeof(struct in_addr));
1224         hlist_add_head_rcu(&key->node, &md5sig->head);
1225         return 0;
1226 }
1227 EXPORT_SYMBOL(tcp_md5_do_add);
1228
1229 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1230                    u8 prefixlen, int l3index, u8 flags)
1231 {
1232         struct tcp_md5sig_key *key;
1233
1234         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1235         if (!key)
1236                 return -ENOENT;
1237         hlist_del_rcu(&key->node);
1238         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1239         kfree_rcu(key, rcu);
1240         return 0;
1241 }
1242 EXPORT_SYMBOL(tcp_md5_do_del);
1243
1244 static void tcp_clear_md5_list(struct sock *sk)
1245 {
1246         struct tcp_sock *tp = tcp_sk(sk);
1247         struct tcp_md5sig_key *key;
1248         struct hlist_node *n;
1249         struct tcp_md5sig_info *md5sig;
1250
1251         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1252
1253         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1254                 hlist_del_rcu(&key->node);
1255                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1256                 kfree_rcu(key, rcu);
1257         }
1258 }
1259
1260 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1261                                  sockptr_t optval, int optlen)
1262 {
1263         struct tcp_md5sig cmd;
1264         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1265         const union tcp_md5_addr *addr;
1266         u8 prefixlen = 32;
1267         int l3index = 0;
1268         u8 flags;
1269
1270         if (optlen < sizeof(cmd))
1271                 return -EINVAL;
1272
1273         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1274                 return -EFAULT;
1275
1276         if (sin->sin_family != AF_INET)
1277                 return -EINVAL;
1278
1279         flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1280
1281         if (optname == TCP_MD5SIG_EXT &&
1282             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1283                 prefixlen = cmd.tcpm_prefixlen;
1284                 if (prefixlen > 32)
1285                         return -EINVAL;
1286         }
1287
1288         if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1289             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1290                 struct net_device *dev;
1291
1292                 rcu_read_lock();
1293                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1294                 if (dev && netif_is_l3_master(dev))
1295                         l3index = dev->ifindex;
1296
1297                 rcu_read_unlock();
1298
1299                 /* ok to reference set/not set outside of rcu;
1300                  * right now device MUST be an L3 master
1301                  */
1302                 if (!dev || !l3index)
1303                         return -EINVAL;
1304         }
1305
1306         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1307
1308         if (!cmd.tcpm_keylen)
1309                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1310
1311         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1312                 return -EINVAL;
1313
1314         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1315                               cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1316 }
1317
1318 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1319                                    __be32 daddr, __be32 saddr,
1320                                    const struct tcphdr *th, int nbytes)
1321 {
1322         struct tcp4_pseudohdr *bp;
1323         struct scatterlist sg;
1324         struct tcphdr *_th;
1325
1326         bp = hp->scratch;
1327         bp->saddr = saddr;
1328         bp->daddr = daddr;
1329         bp->pad = 0;
1330         bp->protocol = IPPROTO_TCP;
1331         bp->len = cpu_to_be16(nbytes);
1332
1333         _th = (struct tcphdr *)(bp + 1);
1334         memcpy(_th, th, sizeof(*th));
1335         _th->check = 0;
1336
1337         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1338         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1339                                 sizeof(*bp) + sizeof(*th));
1340         return crypto_ahash_update(hp->md5_req);
1341 }
1342
1343 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1344                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1345 {
1346         struct tcp_md5sig_pool *hp;
1347         struct ahash_request *req;
1348
1349         hp = tcp_get_md5sig_pool();
1350         if (!hp)
1351                 goto clear_hash_noput;
1352         req = hp->md5_req;
1353
1354         if (crypto_ahash_init(req))
1355                 goto clear_hash;
1356         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1357                 goto clear_hash;
1358         if (tcp_md5_hash_key(hp, key))
1359                 goto clear_hash;
1360         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1361         if (crypto_ahash_final(req))
1362                 goto clear_hash;
1363
1364         tcp_put_md5sig_pool();
1365         return 0;
1366
1367 clear_hash:
1368         tcp_put_md5sig_pool();
1369 clear_hash_noput:
1370         memset(md5_hash, 0, 16);
1371         return 1;
1372 }
1373
1374 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1375                         const struct sock *sk,
1376                         const struct sk_buff *skb)
1377 {
1378         struct tcp_md5sig_pool *hp;
1379         struct ahash_request *req;
1380         const struct tcphdr *th = tcp_hdr(skb);
1381         __be32 saddr, daddr;
1382
1383         if (sk) { /* valid for establish/request sockets */
1384                 saddr = sk->sk_rcv_saddr;
1385                 daddr = sk->sk_daddr;
1386         } else {
1387                 const struct iphdr *iph = ip_hdr(skb);
1388                 saddr = iph->saddr;
1389                 daddr = iph->daddr;
1390         }
1391
1392         hp = tcp_get_md5sig_pool();
1393         if (!hp)
1394                 goto clear_hash_noput;
1395         req = hp->md5_req;
1396
1397         if (crypto_ahash_init(req))
1398                 goto clear_hash;
1399
1400         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1401                 goto clear_hash;
1402         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1403                 goto clear_hash;
1404         if (tcp_md5_hash_key(hp, key))
1405                 goto clear_hash;
1406         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1407         if (crypto_ahash_final(req))
1408                 goto clear_hash;
1409
1410         tcp_put_md5sig_pool();
1411         return 0;
1412
1413 clear_hash:
1414         tcp_put_md5sig_pool();
1415 clear_hash_noput:
1416         memset(md5_hash, 0, 16);
1417         return 1;
1418 }
1419 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1420
1421 #endif
1422
1423 static void tcp_v4_init_req(struct request_sock *req,
1424                             const struct sock *sk_listener,
1425                             struct sk_buff *skb)
1426 {
1427         struct inet_request_sock *ireq = inet_rsk(req);
1428         struct net *net = sock_net(sk_listener);
1429
1430         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1431         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1432         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1433 }
1434
1435 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1436                                           struct sk_buff *skb,
1437                                           struct flowi *fl,
1438                                           struct request_sock *req)
1439 {
1440         tcp_v4_init_req(req, sk, skb);
1441
1442         if (security_inet_conn_request(sk, skb, req))
1443                 return NULL;
1444
1445         return inet_csk_route_req(sk, &fl->u.ip4, req);
1446 }
1447
1448 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1449         .family         =       PF_INET,
1450         .obj_size       =       sizeof(struct tcp_request_sock),
1451         .rtx_syn_ack    =       tcp_rtx_synack,
1452         .send_ack       =       tcp_v4_reqsk_send_ack,
1453         .destructor     =       tcp_v4_reqsk_destructor,
1454         .send_reset     =       tcp_v4_send_reset,
1455         .syn_ack_timeout =      tcp_syn_ack_timeout,
1456 };
1457
1458 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1459         .mss_clamp      =       TCP_MSS_DEFAULT,
1460 #ifdef CONFIG_TCP_MD5SIG
1461         .req_md5_lookup =       tcp_v4_md5_lookup,
1462         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1463 #endif
1464 #ifdef CONFIG_SYN_COOKIES
1465         .cookie_init_seq =      cookie_v4_init_sequence,
1466 #endif
1467         .route_req      =       tcp_v4_route_req,
1468         .init_seq       =       tcp_v4_init_seq,
1469         .init_ts_off    =       tcp_v4_init_ts_off,
1470         .send_synack    =       tcp_v4_send_synack,
1471 };
1472
1473 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1474 {
1475         /* Never answer to SYNs send to broadcast or multicast */
1476         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1477                 goto drop;
1478
1479         return tcp_conn_request(&tcp_request_sock_ops,
1480                                 &tcp_request_sock_ipv4_ops, sk, skb);
1481
1482 drop:
1483         tcp_listendrop(sk);
1484         return 0;
1485 }
1486 EXPORT_SYMBOL(tcp_v4_conn_request);
1487
1488
1489 /*
1490  * The three way handshake has completed - we got a valid synack -
1491  * now create the new socket.
1492  */
1493 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1494                                   struct request_sock *req,
1495                                   struct dst_entry *dst,
1496                                   struct request_sock *req_unhash,
1497                                   bool *own_req)
1498 {
1499         struct inet_request_sock *ireq;
1500         bool found_dup_sk = false;
1501         struct inet_sock *newinet;
1502         struct tcp_sock *newtp;
1503         struct sock *newsk;
1504 #ifdef CONFIG_TCP_MD5SIG
1505         const union tcp_md5_addr *addr;
1506         struct tcp_md5sig_key *key;
1507         int l3index;
1508 #endif
1509         struct ip_options_rcu *inet_opt;
1510
1511         if (sk_acceptq_is_full(sk))
1512                 goto exit_overflow;
1513
1514         newsk = tcp_create_openreq_child(sk, req, skb);
1515         if (!newsk)
1516                 goto exit_nonewsk;
1517
1518         newsk->sk_gso_type = SKB_GSO_TCPV4;
1519         inet_sk_rx_dst_set(newsk, skb);
1520
1521         newtp                 = tcp_sk(newsk);
1522         newinet               = inet_sk(newsk);
1523         ireq                  = inet_rsk(req);
1524         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1525         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1526         newsk->sk_bound_dev_if = ireq->ir_iif;
1527         newinet->inet_saddr   = ireq->ir_loc_addr;
1528         inet_opt              = rcu_dereference(ireq->ireq_opt);
1529         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1530         newinet->mc_index     = inet_iif(skb);
1531         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1532         newinet->rcv_tos      = ip_hdr(skb)->tos;
1533         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1534         if (inet_opt)
1535                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1536         newinet->inet_id = get_random_u16();
1537
1538         /* Set ToS of the new socket based upon the value of incoming SYN.
1539          * ECT bits are set later in tcp_init_transfer().
1540          */
1541         if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1542                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1543
1544         if (!dst) {
1545                 dst = inet_csk_route_child_sock(sk, newsk, req);
1546                 if (!dst)
1547                         goto put_and_exit;
1548         } else {
1549                 /* syncookie case : see end of cookie_v4_check() */
1550         }
1551         sk_setup_caps(newsk, dst);
1552
1553         tcp_ca_openreq_child(newsk, dst);
1554
1555         tcp_sync_mss(newsk, dst_mtu(dst));
1556         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1557
1558         tcp_initialize_rcv_mss(newsk);
1559
1560 #ifdef CONFIG_TCP_MD5SIG
1561         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1562         /* Copy over the MD5 key from the original socket */
1563         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1564         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1565         if (key) {
1566                 /*
1567                  * We're using one, so create a matching key
1568                  * on the newsk structure. If we fail to get
1569                  * memory, then we end up not copying the key
1570                  * across. Shucks.
1571                  */
1572                 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1573                                key->key, key->keylen, GFP_ATOMIC);
1574                 sk_gso_disable(newsk);
1575         }
1576 #endif
1577
1578         if (__inet_inherit_port(sk, newsk) < 0)
1579                 goto put_and_exit;
1580         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1581                                        &found_dup_sk);
1582         if (likely(*own_req)) {
1583                 tcp_move_syn(newtp, req);
1584                 ireq->ireq_opt = NULL;
1585         } else {
1586                 newinet->inet_opt = NULL;
1587
1588                 if (!req_unhash && found_dup_sk) {
1589                         /* This code path should only be executed in the
1590                          * syncookie case only
1591                          */
1592                         bh_unlock_sock(newsk);
1593                         sock_put(newsk);
1594                         newsk = NULL;
1595                 }
1596         }
1597         return newsk;
1598
1599 exit_overflow:
1600         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1601 exit_nonewsk:
1602         dst_release(dst);
1603 exit:
1604         tcp_listendrop(sk);
1605         return NULL;
1606 put_and_exit:
1607         newinet->inet_opt = NULL;
1608         inet_csk_prepare_forced_close(newsk);
1609         tcp_done(newsk);
1610         goto exit;
1611 }
1612 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1613
1614 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1615 {
1616 #ifdef CONFIG_SYN_COOKIES
1617         const struct tcphdr *th = tcp_hdr(skb);
1618
1619         if (!th->syn)
1620                 sk = cookie_v4_check(sk, skb);
1621 #endif
1622         return sk;
1623 }
1624
1625 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1626                          struct tcphdr *th, u32 *cookie)
1627 {
1628         u16 mss = 0;
1629 #ifdef CONFIG_SYN_COOKIES
1630         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1631                                     &tcp_request_sock_ipv4_ops, sk, th);
1632         if (mss) {
1633                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1634                 tcp_synq_overflow(sk);
1635         }
1636 #endif
1637         return mss;
1638 }
1639
1640 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1641                                                            u32));
1642 /* The socket must have it's spinlock held when we get
1643  * here, unless it is a TCP_LISTEN socket.
1644  *
1645  * We have a potential double-lock case here, so even when
1646  * doing backlog processing we use the BH locking scheme.
1647  * This is because we cannot sleep with the original spinlock
1648  * held.
1649  */
1650 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1651 {
1652         enum skb_drop_reason reason;
1653         struct sock *rsk;
1654
1655         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1656                 struct dst_entry *dst;
1657
1658                 dst = rcu_dereference_protected(sk->sk_rx_dst,
1659                                                 lockdep_sock_is_held(sk));
1660
1661                 sock_rps_save_rxhash(sk, skb);
1662                 sk_mark_napi_id(sk, skb);
1663                 if (dst) {
1664                         if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1665                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1666                                              dst, 0)) {
1667                                 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1668                                 dst_release(dst);
1669                         }
1670                 }
1671                 tcp_rcv_established(sk, skb);
1672                 return 0;
1673         }
1674
1675         reason = SKB_DROP_REASON_NOT_SPECIFIED;
1676         if (tcp_checksum_complete(skb))
1677                 goto csum_err;
1678
1679         if (sk->sk_state == TCP_LISTEN) {
1680                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1681
1682                 if (!nsk)
1683                         goto discard;
1684                 if (nsk != sk) {
1685                         if (tcp_child_process(sk, nsk, skb)) {
1686                                 rsk = nsk;
1687                                 goto reset;
1688                         }
1689                         return 0;
1690                 }
1691         } else
1692                 sock_rps_save_rxhash(sk, skb);
1693
1694         if (tcp_rcv_state_process(sk, skb)) {
1695                 rsk = sk;
1696                 goto reset;
1697         }
1698         return 0;
1699
1700 reset:
1701         tcp_v4_send_reset(rsk, skb);
1702 discard:
1703         kfree_skb_reason(skb, reason);
1704         /* Be careful here. If this function gets more complicated and
1705          * gcc suffers from register pressure on the x86, sk (in %ebx)
1706          * might be destroyed here. This current version compiles correctly,
1707          * but you have been warned.
1708          */
1709         return 0;
1710
1711 csum_err:
1712         reason = SKB_DROP_REASON_TCP_CSUM;
1713         trace_tcp_bad_csum(skb);
1714         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1715         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1716         goto discard;
1717 }
1718 EXPORT_SYMBOL(tcp_v4_do_rcv);
1719
1720 int tcp_v4_early_demux(struct sk_buff *skb)
1721 {
1722         struct net *net = dev_net(skb->dev);
1723         const struct iphdr *iph;
1724         const struct tcphdr *th;
1725         struct sock *sk;
1726
1727         if (skb->pkt_type != PACKET_HOST)
1728                 return 0;
1729
1730         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1731                 return 0;
1732
1733         iph = ip_hdr(skb);
1734         th = tcp_hdr(skb);
1735
1736         if (th->doff < sizeof(struct tcphdr) / 4)
1737                 return 0;
1738
1739         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1740                                        iph->saddr, th->source,
1741                                        iph->daddr, ntohs(th->dest),
1742                                        skb->skb_iif, inet_sdif(skb));
1743         if (sk) {
1744                 skb->sk = sk;
1745                 skb->destructor = sock_edemux;
1746                 if (sk_fullsock(sk)) {
1747                         struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1748
1749                         if (dst)
1750                                 dst = dst_check(dst, 0);
1751                         if (dst &&
1752                             sk->sk_rx_dst_ifindex == skb->skb_iif)
1753                                 skb_dst_set_noref(skb, dst);
1754                 }
1755         }
1756         return 0;
1757 }
1758
1759 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1760                      enum skb_drop_reason *reason)
1761 {
1762         u32 limit, tail_gso_size, tail_gso_segs;
1763         struct skb_shared_info *shinfo;
1764         const struct tcphdr *th;
1765         struct tcphdr *thtail;
1766         struct sk_buff *tail;
1767         unsigned int hdrlen;
1768         bool fragstolen;
1769         u32 gso_segs;
1770         u32 gso_size;
1771         int delta;
1772
1773         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1774          * we can fix skb->truesize to its real value to avoid future drops.
1775          * This is valid because skb is not yet charged to the socket.
1776          * It has been noticed pure SACK packets were sometimes dropped
1777          * (if cooked by drivers without copybreak feature).
1778          */
1779         skb_condense(skb);
1780
1781         skb_dst_drop(skb);
1782
1783         if (unlikely(tcp_checksum_complete(skb))) {
1784                 bh_unlock_sock(sk);
1785                 trace_tcp_bad_csum(skb);
1786                 *reason = SKB_DROP_REASON_TCP_CSUM;
1787                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1788                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1789                 return true;
1790         }
1791
1792         /* Attempt coalescing to last skb in backlog, even if we are
1793          * above the limits.
1794          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1795          */
1796         th = (const struct tcphdr *)skb->data;
1797         hdrlen = th->doff * 4;
1798
1799         tail = sk->sk_backlog.tail;
1800         if (!tail)
1801                 goto no_coalesce;
1802         thtail = (struct tcphdr *)tail->data;
1803
1804         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1805             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1806             ((TCP_SKB_CB(tail)->tcp_flags |
1807               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1808             !((TCP_SKB_CB(tail)->tcp_flags &
1809               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1810             ((TCP_SKB_CB(tail)->tcp_flags ^
1811               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1812 #ifdef CONFIG_TLS_DEVICE
1813             tail->decrypted != skb->decrypted ||
1814 #endif
1815             thtail->doff != th->doff ||
1816             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1817                 goto no_coalesce;
1818
1819         __skb_pull(skb, hdrlen);
1820
1821         shinfo = skb_shinfo(skb);
1822         gso_size = shinfo->gso_size ?: skb->len;
1823         gso_segs = shinfo->gso_segs ?: 1;
1824
1825         shinfo = skb_shinfo(tail);
1826         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1827         tail_gso_segs = shinfo->gso_segs ?: 1;
1828
1829         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1830                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1831
1832                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1833                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1834                         thtail->window = th->window;
1835                 }
1836
1837                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1838                  * thtail->fin, so that the fast path in tcp_rcv_established()
1839                  * is not entered if we append a packet with a FIN.
1840                  * SYN, RST, URG are not present.
1841                  * ACK is set on both packets.
1842                  * PSH : we do not really care in TCP stack,
1843                  *       at least for 'GRO' packets.
1844                  */
1845                 thtail->fin |= th->fin;
1846                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1847
1848                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1849                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1850                         tail->tstamp = skb->tstamp;
1851                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1852                 }
1853
1854                 /* Not as strict as GRO. We only need to carry mss max value */
1855                 shinfo->gso_size = max(gso_size, tail_gso_size);
1856                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1857
1858                 sk->sk_backlog.len += delta;
1859                 __NET_INC_STATS(sock_net(sk),
1860                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1861                 kfree_skb_partial(skb, fragstolen);
1862                 return false;
1863         }
1864         __skb_push(skb, hdrlen);
1865
1866 no_coalesce:
1867         limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1868
1869         /* Only socket owner can try to collapse/prune rx queues
1870          * to reduce memory overhead, so add a little headroom here.
1871          * Few sockets backlog are possibly concurrently non empty.
1872          */
1873         limit += 64 * 1024;
1874
1875         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1876                 bh_unlock_sock(sk);
1877                 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1878                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1879                 return true;
1880         }
1881         return false;
1882 }
1883 EXPORT_SYMBOL(tcp_add_backlog);
1884
1885 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1886 {
1887         struct tcphdr *th = (struct tcphdr *)skb->data;
1888
1889         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1890 }
1891 EXPORT_SYMBOL(tcp_filter);
1892
1893 static void tcp_v4_restore_cb(struct sk_buff *skb)
1894 {
1895         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1896                 sizeof(struct inet_skb_parm));
1897 }
1898
1899 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1900                            const struct tcphdr *th)
1901 {
1902         /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1903          * barrier() makes sure compiler wont play fool^Waliasing games.
1904          */
1905         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1906                 sizeof(struct inet_skb_parm));
1907         barrier();
1908
1909         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1910         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1911                                     skb->len - th->doff * 4);
1912         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1913         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1914         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1915         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1916         TCP_SKB_CB(skb)->sacked  = 0;
1917         TCP_SKB_CB(skb)->has_rxtstamp =
1918                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1919 }
1920
1921 /*
1922  *      From tcp_input.c
1923  */
1924
1925 int tcp_v4_rcv(struct sk_buff *skb)
1926 {
1927         struct net *net = dev_net(skb->dev);
1928         enum skb_drop_reason drop_reason;
1929         int sdif = inet_sdif(skb);
1930         int dif = inet_iif(skb);
1931         const struct iphdr *iph;
1932         const struct tcphdr *th;
1933         bool refcounted;
1934         struct sock *sk;
1935         int ret;
1936
1937         drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1938         if (skb->pkt_type != PACKET_HOST)
1939                 goto discard_it;
1940
1941         /* Count it even if it's bad */
1942         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1943
1944         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1945                 goto discard_it;
1946
1947         th = (const struct tcphdr *)skb->data;
1948
1949         if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1950                 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1951                 goto bad_packet;
1952         }
1953         if (!pskb_may_pull(skb, th->doff * 4))
1954                 goto discard_it;
1955
1956         /* An explanation is required here, I think.
1957          * Packet length and doff are validated by header prediction,
1958          * provided case of th->doff==0 is eliminated.
1959          * So, we defer the checks. */
1960
1961         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1962                 goto csum_error;
1963
1964         th = (const struct tcphdr *)skb->data;
1965         iph = ip_hdr(skb);
1966 lookup:
1967         sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
1968                                skb, __tcp_hdrlen(th), th->source,
1969                                th->dest, sdif, &refcounted);
1970         if (!sk)
1971                 goto no_tcp_socket;
1972
1973 process:
1974         if (sk->sk_state == TCP_TIME_WAIT)
1975                 goto do_time_wait;
1976
1977         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1978                 struct request_sock *req = inet_reqsk(sk);
1979                 bool req_stolen = false;
1980                 struct sock *nsk;
1981
1982                 sk = req->rsk_listener;
1983                 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1984                         drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1985                 else
1986                         drop_reason = tcp_inbound_md5_hash(sk, skb,
1987                                                    &iph->saddr, &iph->daddr,
1988                                                    AF_INET, dif, sdif);
1989                 if (unlikely(drop_reason)) {
1990                         sk_drops_add(sk, skb);
1991                         reqsk_put(req);
1992                         goto discard_it;
1993                 }
1994                 if (tcp_checksum_complete(skb)) {
1995                         reqsk_put(req);
1996                         goto csum_error;
1997                 }
1998                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1999                         nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2000                         if (!nsk) {
2001                                 inet_csk_reqsk_queue_drop_and_put(sk, req);
2002                                 goto lookup;
2003                         }
2004                         sk = nsk;
2005                         /* reuseport_migrate_sock() has already held one sk_refcnt
2006                          * before returning.
2007                          */
2008                 } else {
2009                         /* We own a reference on the listener, increase it again
2010                          * as we might lose it too soon.
2011                          */
2012                         sock_hold(sk);
2013                 }
2014                 refcounted = true;
2015                 nsk = NULL;
2016                 if (!tcp_filter(sk, skb)) {
2017                         th = (const struct tcphdr *)skb->data;
2018                         iph = ip_hdr(skb);
2019                         tcp_v4_fill_cb(skb, iph, th);
2020                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2021                 } else {
2022                         drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2023                 }
2024                 if (!nsk) {
2025                         reqsk_put(req);
2026                         if (req_stolen) {
2027                                 /* Another cpu got exclusive access to req
2028                                  * and created a full blown socket.
2029                                  * Try to feed this packet to this socket
2030                                  * instead of discarding it.
2031                                  */
2032                                 tcp_v4_restore_cb(skb);
2033                                 sock_put(sk);
2034                                 goto lookup;
2035                         }
2036                         goto discard_and_relse;
2037                 }
2038                 nf_reset_ct(skb);
2039                 if (nsk == sk) {
2040                         reqsk_put(req);
2041                         tcp_v4_restore_cb(skb);
2042                 } else if (tcp_child_process(sk, nsk, skb)) {
2043                         tcp_v4_send_reset(nsk, skb);
2044                         goto discard_and_relse;
2045                 } else {
2046                         sock_put(sk);
2047                         return 0;
2048                 }
2049         }
2050
2051         if (static_branch_unlikely(&ip4_min_ttl)) {
2052                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2053                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2054                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2055                         goto discard_and_relse;
2056                 }
2057         }
2058
2059         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2060                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2061                 goto discard_and_relse;
2062         }
2063
2064         drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2065                                            &iph->daddr, AF_INET, dif, sdif);
2066         if (drop_reason)
2067                 goto discard_and_relse;
2068
2069         nf_reset_ct(skb);
2070
2071         if (tcp_filter(sk, skb)) {
2072                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2073                 goto discard_and_relse;
2074         }
2075         th = (const struct tcphdr *)skb->data;
2076         iph = ip_hdr(skb);
2077         tcp_v4_fill_cb(skb, iph, th);
2078
2079         skb->dev = NULL;
2080
2081         if (sk->sk_state == TCP_LISTEN) {
2082                 ret = tcp_v4_do_rcv(sk, skb);
2083                 goto put_and_return;
2084         }
2085
2086         sk_incoming_cpu_update(sk);
2087
2088         bh_lock_sock_nested(sk);
2089         tcp_segs_in(tcp_sk(sk), skb);
2090         ret = 0;
2091         if (!sock_owned_by_user(sk)) {
2092                 ret = tcp_v4_do_rcv(sk, skb);
2093         } else {
2094                 if (tcp_add_backlog(sk, skb, &drop_reason))
2095                         goto discard_and_relse;
2096         }
2097         bh_unlock_sock(sk);
2098
2099 put_and_return:
2100         if (refcounted)
2101                 sock_put(sk);
2102
2103         return ret;
2104
2105 no_tcp_socket:
2106         drop_reason = SKB_DROP_REASON_NO_SOCKET;
2107         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2108                 goto discard_it;
2109
2110         tcp_v4_fill_cb(skb, iph, th);
2111
2112         if (tcp_checksum_complete(skb)) {
2113 csum_error:
2114                 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2115                 trace_tcp_bad_csum(skb);
2116                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2117 bad_packet:
2118                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2119         } else {
2120                 tcp_v4_send_reset(NULL, skb);
2121         }
2122
2123 discard_it:
2124         SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2125         /* Discard frame. */
2126         kfree_skb_reason(skb, drop_reason);
2127         return 0;
2128
2129 discard_and_relse:
2130         sk_drops_add(sk, skb);
2131         if (refcounted)
2132                 sock_put(sk);
2133         goto discard_it;
2134
2135 do_time_wait:
2136         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2137                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2138                 inet_twsk_put(inet_twsk(sk));
2139                 goto discard_it;
2140         }
2141
2142         tcp_v4_fill_cb(skb, iph, th);
2143
2144         if (tcp_checksum_complete(skb)) {
2145                 inet_twsk_put(inet_twsk(sk));
2146                 goto csum_error;
2147         }
2148         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2149         case TCP_TW_SYN: {
2150                 struct sock *sk2 = inet_lookup_listener(net,
2151                                                         net->ipv4.tcp_death_row.hashinfo,
2152                                                         skb, __tcp_hdrlen(th),
2153                                                         iph->saddr, th->source,
2154                                                         iph->daddr, th->dest,
2155                                                         inet_iif(skb),
2156                                                         sdif);
2157                 if (sk2) {
2158                         inet_twsk_deschedule_put(inet_twsk(sk));
2159                         sk = sk2;
2160                         tcp_v4_restore_cb(skb);
2161                         refcounted = false;
2162                         goto process;
2163                 }
2164         }
2165                 /* to ACK */
2166                 fallthrough;
2167         case TCP_TW_ACK:
2168                 tcp_v4_timewait_ack(sk, skb);
2169                 break;
2170         case TCP_TW_RST:
2171                 tcp_v4_send_reset(sk, skb);
2172                 inet_twsk_deschedule_put(inet_twsk(sk));
2173                 goto discard_it;
2174         case TCP_TW_SUCCESS:;
2175         }
2176         goto discard_it;
2177 }
2178
2179 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2180         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2181         .twsk_unique    = tcp_twsk_unique,
2182         .twsk_destructor= tcp_twsk_destructor,
2183 };
2184
2185 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2186 {
2187         struct dst_entry *dst = skb_dst(skb);
2188
2189         if (dst && dst_hold_safe(dst)) {
2190                 rcu_assign_pointer(sk->sk_rx_dst, dst);
2191                 sk->sk_rx_dst_ifindex = skb->skb_iif;
2192         }
2193 }
2194 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2195
2196 const struct inet_connection_sock_af_ops ipv4_specific = {
2197         .queue_xmit        = ip_queue_xmit,
2198         .send_check        = tcp_v4_send_check,
2199         .rebuild_header    = inet_sk_rebuild_header,
2200         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2201         .conn_request      = tcp_v4_conn_request,
2202         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2203         .net_header_len    = sizeof(struct iphdr),
2204         .setsockopt        = ip_setsockopt,
2205         .getsockopt        = ip_getsockopt,
2206         .addr2sockaddr     = inet_csk_addr2sockaddr,
2207         .sockaddr_len      = sizeof(struct sockaddr_in),
2208         .mtu_reduced       = tcp_v4_mtu_reduced,
2209 };
2210 EXPORT_SYMBOL(ipv4_specific);
2211
2212 #ifdef CONFIG_TCP_MD5SIG
2213 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2214         .md5_lookup             = tcp_v4_md5_lookup,
2215         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2216         .md5_parse              = tcp_v4_parse_md5_keys,
2217 };
2218 #endif
2219
2220 /* NOTE: A lot of things set to zero explicitly by call to
2221  *       sk_alloc() so need not be done here.
2222  */
2223 static int tcp_v4_init_sock(struct sock *sk)
2224 {
2225         struct inet_connection_sock *icsk = inet_csk(sk);
2226
2227         tcp_init_sock(sk);
2228
2229         icsk->icsk_af_ops = &ipv4_specific;
2230
2231 #ifdef CONFIG_TCP_MD5SIG
2232         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2233 #endif
2234
2235         return 0;
2236 }
2237
2238 void tcp_v4_destroy_sock(struct sock *sk)
2239 {
2240         struct tcp_sock *tp = tcp_sk(sk);
2241
2242         trace_tcp_destroy_sock(sk);
2243
2244         tcp_clear_xmit_timers(sk);
2245
2246         tcp_cleanup_congestion_control(sk);
2247
2248         tcp_cleanup_ulp(sk);
2249
2250         /* Cleanup up the write buffer. */
2251         tcp_write_queue_purge(sk);
2252
2253         /* Check if we want to disable active TFO */
2254         tcp_fastopen_active_disable_ofo_check(sk);
2255
2256         /* Cleans up our, hopefully empty, out_of_order_queue. */
2257         skb_rbtree_purge(&tp->out_of_order_queue);
2258
2259 #ifdef CONFIG_TCP_MD5SIG
2260         /* Clean up the MD5 key list, if any */
2261         if (tp->md5sig_info) {
2262                 tcp_clear_md5_list(sk);
2263                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2264                 tp->md5sig_info = NULL;
2265         }
2266 #endif
2267
2268         /* Clean up a referenced TCP bind bucket. */
2269         if (inet_csk(sk)->icsk_bind_hash)
2270                 inet_put_port(sk);
2271
2272         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2273
2274         /* If socket is aborted during connect operation */
2275         tcp_free_fastopen_req(tp);
2276         tcp_fastopen_destroy_cipher(sk);
2277         tcp_saved_syn_free(tp);
2278
2279         sk_sockets_allocated_dec(sk);
2280 }
2281 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2282
2283 #ifdef CONFIG_PROC_FS
2284 /* Proc filesystem TCP sock list dumping. */
2285
2286 static unsigned short seq_file_family(const struct seq_file *seq);
2287
2288 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2289 {
2290         unsigned short family = seq_file_family(seq);
2291
2292         /* AF_UNSPEC is used as a match all */
2293         return ((family == AF_UNSPEC || family == sk->sk_family) &&
2294                 net_eq(sock_net(sk), seq_file_net(seq)));
2295 }
2296
2297 /* Find a non empty bucket (starting from st->bucket)
2298  * and return the first sk from it.
2299  */
2300 static void *listening_get_first(struct seq_file *seq)
2301 {
2302         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2303         struct tcp_iter_state *st = seq->private;
2304
2305         st->offset = 0;
2306         for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2307                 struct inet_listen_hashbucket *ilb2;
2308                 struct hlist_nulls_node *node;
2309                 struct sock *sk;
2310
2311                 ilb2 = &hinfo->lhash2[st->bucket];
2312                 if (hlist_nulls_empty(&ilb2->nulls_head))
2313                         continue;
2314
2315                 spin_lock(&ilb2->lock);
2316                 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2317                         if (seq_sk_match(seq, sk))
2318                                 return sk;
2319                 }
2320                 spin_unlock(&ilb2->lock);
2321         }
2322
2323         return NULL;
2324 }
2325
2326 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2327  * If "cur" is the last one in the st->bucket,
2328  * call listening_get_first() to return the first sk of the next
2329  * non empty bucket.
2330  */
2331 static void *listening_get_next(struct seq_file *seq, void *cur)
2332 {
2333         struct tcp_iter_state *st = seq->private;
2334         struct inet_listen_hashbucket *ilb2;
2335         struct hlist_nulls_node *node;
2336         struct inet_hashinfo *hinfo;
2337         struct sock *sk = cur;
2338
2339         ++st->num;
2340         ++st->offset;
2341
2342         sk = sk_nulls_next(sk);
2343         sk_nulls_for_each_from(sk, node) {
2344                 if (seq_sk_match(seq, sk))
2345                         return sk;
2346         }
2347
2348         hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2349         ilb2 = &hinfo->lhash2[st->bucket];
2350         spin_unlock(&ilb2->lock);
2351         ++st->bucket;
2352         return listening_get_first(seq);
2353 }
2354
2355 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2356 {
2357         struct tcp_iter_state *st = seq->private;
2358         void *rc;
2359
2360         st->bucket = 0;
2361         st->offset = 0;
2362         rc = listening_get_first(seq);
2363
2364         while (rc && *pos) {
2365                 rc = listening_get_next(seq, rc);
2366                 --*pos;
2367         }
2368         return rc;
2369 }
2370
2371 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2372                                 const struct tcp_iter_state *st)
2373 {
2374         return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2375 }
2376
2377 /*
2378  * Get first established socket starting from bucket given in st->bucket.
2379  * If st->bucket is zero, the very first socket in the hash is returned.
2380  */
2381 static void *established_get_first(struct seq_file *seq)
2382 {
2383         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2384         struct tcp_iter_state *st = seq->private;
2385
2386         st->offset = 0;
2387         for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2388                 struct sock *sk;
2389                 struct hlist_nulls_node *node;
2390                 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2391
2392                 /* Lockless fast path for the common case of empty buckets */
2393                 if (empty_bucket(hinfo, st))
2394                         continue;
2395
2396                 spin_lock_bh(lock);
2397                 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2398                         if (seq_sk_match(seq, sk))
2399                                 return sk;
2400                 }
2401                 spin_unlock_bh(lock);
2402         }
2403
2404         return NULL;
2405 }
2406
2407 static void *established_get_next(struct seq_file *seq, void *cur)
2408 {
2409         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2410         struct tcp_iter_state *st = seq->private;
2411         struct hlist_nulls_node *node;
2412         struct sock *sk = cur;
2413
2414         ++st->num;
2415         ++st->offset;
2416
2417         sk = sk_nulls_next(sk);
2418
2419         sk_nulls_for_each_from(sk, node) {
2420                 if (seq_sk_match(seq, sk))
2421                         return sk;
2422         }
2423
2424         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2425         ++st->bucket;
2426         return established_get_first(seq);
2427 }
2428
2429 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2430 {
2431         struct tcp_iter_state *st = seq->private;
2432         void *rc;
2433
2434         st->bucket = 0;
2435         rc = established_get_first(seq);
2436
2437         while (rc && pos) {
2438                 rc = established_get_next(seq, rc);
2439                 --pos;
2440         }
2441         return rc;
2442 }
2443
2444 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2445 {
2446         void *rc;
2447         struct tcp_iter_state *st = seq->private;
2448
2449         st->state = TCP_SEQ_STATE_LISTENING;
2450         rc        = listening_get_idx(seq, &pos);
2451
2452         if (!rc) {
2453                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2454                 rc        = established_get_idx(seq, pos);
2455         }
2456
2457         return rc;
2458 }
2459
2460 static void *tcp_seek_last_pos(struct seq_file *seq)
2461 {
2462         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2463         struct tcp_iter_state *st = seq->private;
2464         int bucket = st->bucket;
2465         int offset = st->offset;
2466         int orig_num = st->num;
2467         void *rc = NULL;
2468
2469         switch (st->state) {
2470         case TCP_SEQ_STATE_LISTENING:
2471                 if (st->bucket > hinfo->lhash2_mask)
2472                         break;
2473                 st->state = TCP_SEQ_STATE_LISTENING;
2474                 rc = listening_get_first(seq);
2475                 while (offset-- && rc && bucket == st->bucket)
2476                         rc = listening_get_next(seq, rc);
2477                 if (rc)
2478                         break;
2479                 st->bucket = 0;
2480                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2481                 fallthrough;
2482         case TCP_SEQ_STATE_ESTABLISHED:
2483                 if (st->bucket > hinfo->ehash_mask)
2484                         break;
2485                 rc = established_get_first(seq);
2486                 while (offset-- && rc && bucket == st->bucket)
2487                         rc = established_get_next(seq, rc);
2488         }
2489
2490         st->num = orig_num;
2491
2492         return rc;
2493 }
2494
2495 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2496 {
2497         struct tcp_iter_state *st = seq->private;
2498         void *rc;
2499
2500         if (*pos && *pos == st->last_pos) {
2501                 rc = tcp_seek_last_pos(seq);
2502                 if (rc)
2503                         goto out;
2504         }
2505
2506         st->state = TCP_SEQ_STATE_LISTENING;
2507         st->num = 0;
2508         st->bucket = 0;
2509         st->offset = 0;
2510         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2511
2512 out:
2513         st->last_pos = *pos;
2514         return rc;
2515 }
2516 EXPORT_SYMBOL(tcp_seq_start);
2517
2518 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2519 {
2520         struct tcp_iter_state *st = seq->private;
2521         void *rc = NULL;
2522
2523         if (v == SEQ_START_TOKEN) {
2524                 rc = tcp_get_idx(seq, 0);
2525                 goto out;
2526         }
2527
2528         switch (st->state) {
2529         case TCP_SEQ_STATE_LISTENING:
2530                 rc = listening_get_next(seq, v);
2531                 if (!rc) {
2532                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2533                         st->bucket = 0;
2534                         st->offset = 0;
2535                         rc        = established_get_first(seq);
2536                 }
2537                 break;
2538         case TCP_SEQ_STATE_ESTABLISHED:
2539                 rc = established_get_next(seq, v);
2540                 break;
2541         }
2542 out:
2543         ++*pos;
2544         st->last_pos = *pos;
2545         return rc;
2546 }
2547 EXPORT_SYMBOL(tcp_seq_next);
2548
2549 void tcp_seq_stop(struct seq_file *seq, void *v)
2550 {
2551         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2552         struct tcp_iter_state *st = seq->private;
2553
2554         switch (st->state) {
2555         case TCP_SEQ_STATE_LISTENING:
2556                 if (v != SEQ_START_TOKEN)
2557                         spin_unlock(&hinfo->lhash2[st->bucket].lock);
2558                 break;
2559         case TCP_SEQ_STATE_ESTABLISHED:
2560                 if (v)
2561                         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2562                 break;
2563         }
2564 }
2565 EXPORT_SYMBOL(tcp_seq_stop);
2566
2567 static void get_openreq4(const struct request_sock *req,
2568                          struct seq_file *f, int i)
2569 {
2570         const struct inet_request_sock *ireq = inet_rsk(req);
2571         long delta = req->rsk_timer.expires - jiffies;
2572
2573         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2574                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2575                 i,
2576                 ireq->ir_loc_addr,
2577                 ireq->ir_num,
2578                 ireq->ir_rmt_addr,
2579                 ntohs(ireq->ir_rmt_port),
2580                 TCP_SYN_RECV,
2581                 0, 0, /* could print option size, but that is af dependent. */
2582                 1,    /* timers active (only the expire timer) */
2583                 jiffies_delta_to_clock_t(delta),
2584                 req->num_timeout,
2585                 from_kuid_munged(seq_user_ns(f),
2586                                  sock_i_uid(req->rsk_listener)),
2587                 0,  /* non standard timer */
2588                 0, /* open_requests have no inode */
2589                 0,
2590                 req);
2591 }
2592
2593 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2594 {
2595         int timer_active;
2596         unsigned long timer_expires;
2597         const struct tcp_sock *tp = tcp_sk(sk);
2598         const struct inet_connection_sock *icsk = inet_csk(sk);
2599         const struct inet_sock *inet = inet_sk(sk);
2600         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2601         __be32 dest = inet->inet_daddr;
2602         __be32 src = inet->inet_rcv_saddr;
2603         __u16 destp = ntohs(inet->inet_dport);
2604         __u16 srcp = ntohs(inet->inet_sport);
2605         int rx_queue;
2606         int state;
2607
2608         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2609             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2610             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2611                 timer_active    = 1;
2612                 timer_expires   = icsk->icsk_timeout;
2613         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2614                 timer_active    = 4;
2615                 timer_expires   = icsk->icsk_timeout;
2616         } else if (timer_pending(&sk->sk_timer)) {
2617                 timer_active    = 2;
2618                 timer_expires   = sk->sk_timer.expires;
2619         } else {
2620                 timer_active    = 0;
2621                 timer_expires = jiffies;
2622         }
2623
2624         state = inet_sk_state_load(sk);
2625         if (state == TCP_LISTEN)
2626                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2627         else
2628                 /* Because we don't lock the socket,
2629                  * we might find a transient negative value.
2630                  */
2631                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2632                                       READ_ONCE(tp->copied_seq), 0);
2633
2634         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2635                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2636                 i, src, srcp, dest, destp, state,
2637                 READ_ONCE(tp->write_seq) - tp->snd_una,
2638                 rx_queue,
2639                 timer_active,
2640                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2641                 icsk->icsk_retransmits,
2642                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2643                 icsk->icsk_probes_out,
2644                 sock_i_ino(sk),
2645                 refcount_read(&sk->sk_refcnt), sk,
2646                 jiffies_to_clock_t(icsk->icsk_rto),
2647                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2648                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2649                 tcp_snd_cwnd(tp),
2650                 state == TCP_LISTEN ?
2651                     fastopenq->max_qlen :
2652                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2653 }
2654
2655 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2656                                struct seq_file *f, int i)
2657 {
2658         long delta = tw->tw_timer.expires - jiffies;
2659         __be32 dest, src;
2660         __u16 destp, srcp;
2661
2662         dest  = tw->tw_daddr;
2663         src   = tw->tw_rcv_saddr;
2664         destp = ntohs(tw->tw_dport);
2665         srcp  = ntohs(tw->tw_sport);
2666
2667         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2668                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2669                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2670                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2671                 refcount_read(&tw->tw_refcnt), tw);
2672 }
2673
2674 #define TMPSZ 150
2675
2676 static int tcp4_seq_show(struct seq_file *seq, void *v)
2677 {
2678         struct tcp_iter_state *st;
2679         struct sock *sk = v;
2680
2681         seq_setwidth(seq, TMPSZ - 1);
2682         if (v == SEQ_START_TOKEN) {
2683                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2684                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2685                            "inode");
2686                 goto out;
2687         }
2688         st = seq->private;
2689
2690         if (sk->sk_state == TCP_TIME_WAIT)
2691                 get_timewait4_sock(v, seq, st->num);
2692         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2693                 get_openreq4(v, seq, st->num);
2694         else
2695                 get_tcp4_sock(v, seq, st->num);
2696 out:
2697         seq_pad(seq, '\n');
2698         return 0;
2699 }
2700
2701 #ifdef CONFIG_BPF_SYSCALL
2702 struct bpf_tcp_iter_state {
2703         struct tcp_iter_state state;
2704         unsigned int cur_sk;
2705         unsigned int end_sk;
2706         unsigned int max_sk;
2707         struct sock **batch;
2708         bool st_bucket_done;
2709 };
2710
2711 struct bpf_iter__tcp {
2712         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2713         __bpf_md_ptr(struct sock_common *, sk_common);
2714         uid_t uid __aligned(8);
2715 };
2716
2717 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2718                              struct sock_common *sk_common, uid_t uid)
2719 {
2720         struct bpf_iter__tcp ctx;
2721
2722         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2723         ctx.meta = meta;
2724         ctx.sk_common = sk_common;
2725         ctx.uid = uid;
2726         return bpf_iter_run_prog(prog, &ctx);
2727 }
2728
2729 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2730 {
2731         while (iter->cur_sk < iter->end_sk)
2732                 sock_gen_put(iter->batch[iter->cur_sk++]);
2733 }
2734
2735 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2736                                       unsigned int new_batch_sz)
2737 {
2738         struct sock **new_batch;
2739
2740         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2741                              GFP_USER | __GFP_NOWARN);
2742         if (!new_batch)
2743                 return -ENOMEM;
2744
2745         bpf_iter_tcp_put_batch(iter);
2746         kvfree(iter->batch);
2747         iter->batch = new_batch;
2748         iter->max_sk = new_batch_sz;
2749
2750         return 0;
2751 }
2752
2753 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2754                                                  struct sock *start_sk)
2755 {
2756         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2757         struct bpf_tcp_iter_state *iter = seq->private;
2758         struct tcp_iter_state *st = &iter->state;
2759         struct hlist_nulls_node *node;
2760         unsigned int expected = 1;
2761         struct sock *sk;
2762
2763         sock_hold(start_sk);
2764         iter->batch[iter->end_sk++] = start_sk;
2765
2766         sk = sk_nulls_next(start_sk);
2767         sk_nulls_for_each_from(sk, node) {
2768                 if (seq_sk_match(seq, sk)) {
2769                         if (iter->end_sk < iter->max_sk) {
2770                                 sock_hold(sk);
2771                                 iter->batch[iter->end_sk++] = sk;
2772                         }
2773                         expected++;
2774                 }
2775         }
2776         spin_unlock(&hinfo->lhash2[st->bucket].lock);
2777
2778         return expected;
2779 }
2780
2781 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2782                                                    struct sock *start_sk)
2783 {
2784         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2785         struct bpf_tcp_iter_state *iter = seq->private;
2786         struct tcp_iter_state *st = &iter->state;
2787         struct hlist_nulls_node *node;
2788         unsigned int expected = 1;
2789         struct sock *sk;
2790
2791         sock_hold(start_sk);
2792         iter->batch[iter->end_sk++] = start_sk;
2793
2794         sk = sk_nulls_next(start_sk);
2795         sk_nulls_for_each_from(sk, node) {
2796                 if (seq_sk_match(seq, sk)) {
2797                         if (iter->end_sk < iter->max_sk) {
2798                                 sock_hold(sk);
2799                                 iter->batch[iter->end_sk++] = sk;
2800                         }
2801                         expected++;
2802                 }
2803         }
2804         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2805
2806         return expected;
2807 }
2808
2809 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2810 {
2811         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2812         struct bpf_tcp_iter_state *iter = seq->private;
2813         struct tcp_iter_state *st = &iter->state;
2814         unsigned int expected;
2815         bool resized = false;
2816         struct sock *sk;
2817
2818         /* The st->bucket is done.  Directly advance to the next
2819          * bucket instead of having the tcp_seek_last_pos() to skip
2820          * one by one in the current bucket and eventually find out
2821          * it has to advance to the next bucket.
2822          */
2823         if (iter->st_bucket_done) {
2824                 st->offset = 0;
2825                 st->bucket++;
2826                 if (st->state == TCP_SEQ_STATE_LISTENING &&
2827                     st->bucket > hinfo->lhash2_mask) {
2828                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2829                         st->bucket = 0;
2830                 }
2831         }
2832
2833 again:
2834         /* Get a new batch */
2835         iter->cur_sk = 0;
2836         iter->end_sk = 0;
2837         iter->st_bucket_done = false;
2838
2839         sk = tcp_seek_last_pos(seq);
2840         if (!sk)
2841                 return NULL; /* Done */
2842
2843         if (st->state == TCP_SEQ_STATE_LISTENING)
2844                 expected = bpf_iter_tcp_listening_batch(seq, sk);
2845         else
2846                 expected = bpf_iter_tcp_established_batch(seq, sk);
2847
2848         if (iter->end_sk == expected) {
2849                 iter->st_bucket_done = true;
2850                 return sk;
2851         }
2852
2853         if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2854                 resized = true;
2855                 goto again;
2856         }
2857
2858         return sk;
2859 }
2860
2861 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2862 {
2863         /* bpf iter does not support lseek, so it always
2864          * continue from where it was stop()-ped.
2865          */
2866         if (*pos)
2867                 return bpf_iter_tcp_batch(seq);
2868
2869         return SEQ_START_TOKEN;
2870 }
2871
2872 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2873 {
2874         struct bpf_tcp_iter_state *iter = seq->private;
2875         struct tcp_iter_state *st = &iter->state;
2876         struct sock *sk;
2877
2878         /* Whenever seq_next() is called, the iter->cur_sk is
2879          * done with seq_show(), so advance to the next sk in
2880          * the batch.
2881          */
2882         if (iter->cur_sk < iter->end_sk) {
2883                 /* Keeping st->num consistent in tcp_iter_state.
2884                  * bpf_iter_tcp does not use st->num.
2885                  * meta.seq_num is used instead.
2886                  */
2887                 st->num++;
2888                 /* Move st->offset to the next sk in the bucket such that
2889                  * the future start() will resume at st->offset in
2890                  * st->bucket.  See tcp_seek_last_pos().
2891                  */
2892                 st->offset++;
2893                 sock_gen_put(iter->batch[iter->cur_sk++]);
2894         }
2895
2896         if (iter->cur_sk < iter->end_sk)
2897                 sk = iter->batch[iter->cur_sk];
2898         else
2899                 sk = bpf_iter_tcp_batch(seq);
2900
2901         ++*pos;
2902         /* Keeping st->last_pos consistent in tcp_iter_state.
2903          * bpf iter does not do lseek, so st->last_pos always equals to *pos.
2904          */
2905         st->last_pos = *pos;
2906         return sk;
2907 }
2908
2909 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2910 {
2911         struct bpf_iter_meta meta;
2912         struct bpf_prog *prog;
2913         struct sock *sk = v;
2914         bool slow;
2915         uid_t uid;
2916         int ret;
2917
2918         if (v == SEQ_START_TOKEN)
2919                 return 0;
2920
2921         if (sk_fullsock(sk))
2922                 slow = lock_sock_fast(sk);
2923
2924         if (unlikely(sk_unhashed(sk))) {
2925                 ret = SEQ_SKIP;
2926                 goto unlock;
2927         }
2928
2929         if (sk->sk_state == TCP_TIME_WAIT) {
2930                 uid = 0;
2931         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2932                 const struct request_sock *req = v;
2933
2934                 uid = from_kuid_munged(seq_user_ns(seq),
2935                                        sock_i_uid(req->rsk_listener));
2936         } else {
2937                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2938         }
2939
2940         meta.seq = seq;
2941         prog = bpf_iter_get_info(&meta, false);
2942         ret = tcp_prog_seq_show(prog, &meta, v, uid);
2943
2944 unlock:
2945         if (sk_fullsock(sk))
2946                 unlock_sock_fast(sk, slow);
2947         return ret;
2948
2949 }
2950
2951 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2952 {
2953         struct bpf_tcp_iter_state *iter = seq->private;
2954         struct bpf_iter_meta meta;
2955         struct bpf_prog *prog;
2956
2957         if (!v) {
2958                 meta.seq = seq;
2959                 prog = bpf_iter_get_info(&meta, true);
2960                 if (prog)
2961                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
2962         }
2963
2964         if (iter->cur_sk < iter->end_sk) {
2965                 bpf_iter_tcp_put_batch(iter);
2966                 iter->st_bucket_done = false;
2967         }
2968 }
2969
2970 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2971         .show           = bpf_iter_tcp_seq_show,
2972         .start          = bpf_iter_tcp_seq_start,
2973         .next           = bpf_iter_tcp_seq_next,
2974         .stop           = bpf_iter_tcp_seq_stop,
2975 };
2976 #endif
2977 static unsigned short seq_file_family(const struct seq_file *seq)
2978 {
2979         const struct tcp_seq_afinfo *afinfo;
2980
2981 #ifdef CONFIG_BPF_SYSCALL
2982         /* Iterated from bpf_iter.  Let the bpf prog to filter instead. */
2983         if (seq->op == &bpf_iter_tcp_seq_ops)
2984                 return AF_UNSPEC;
2985 #endif
2986
2987         /* Iterated from proc fs */
2988         afinfo = pde_data(file_inode(seq->file));
2989         return afinfo->family;
2990 }
2991
2992 static const struct seq_operations tcp4_seq_ops = {
2993         .show           = tcp4_seq_show,
2994         .start          = tcp_seq_start,
2995         .next           = tcp_seq_next,
2996         .stop           = tcp_seq_stop,
2997 };
2998
2999 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3000         .family         = AF_INET,
3001 };
3002
3003 static int __net_init tcp4_proc_init_net(struct net *net)
3004 {
3005         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3006                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3007                 return -ENOMEM;
3008         return 0;
3009 }
3010
3011 static void __net_exit tcp4_proc_exit_net(struct net *net)
3012 {
3013         remove_proc_entry("tcp", net->proc_net);
3014 }
3015
3016 static struct pernet_operations tcp4_net_ops = {
3017         .init = tcp4_proc_init_net,
3018         .exit = tcp4_proc_exit_net,
3019 };
3020
3021 int __init tcp4_proc_init(void)
3022 {
3023         return register_pernet_subsys(&tcp4_net_ops);
3024 }
3025
3026 void tcp4_proc_exit(void)
3027 {
3028         unregister_pernet_subsys(&tcp4_net_ops);
3029 }
3030 #endif /* CONFIG_PROC_FS */
3031
3032 /* @wake is one when sk_stream_write_space() calls us.
3033  * This sends EPOLLOUT only if notsent_bytes is half the limit.
3034  * This mimics the strategy used in sock_def_write_space().
3035  */
3036 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3037 {
3038         const struct tcp_sock *tp = tcp_sk(sk);
3039         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3040                             READ_ONCE(tp->snd_nxt);
3041
3042         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3043 }
3044 EXPORT_SYMBOL(tcp_stream_memory_free);
3045
3046 struct proto tcp_prot = {
3047         .name                   = "TCP",
3048         .owner                  = THIS_MODULE,
3049         .close                  = tcp_close,
3050         .pre_connect            = tcp_v4_pre_connect,
3051         .connect                = tcp_v4_connect,
3052         .disconnect             = tcp_disconnect,
3053         .accept                 = inet_csk_accept,
3054         .ioctl                  = tcp_ioctl,
3055         .init                   = tcp_v4_init_sock,
3056         .destroy                = tcp_v4_destroy_sock,
3057         .shutdown               = tcp_shutdown,
3058         .setsockopt             = tcp_setsockopt,
3059         .getsockopt             = tcp_getsockopt,
3060         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3061         .keepalive              = tcp_set_keepalive,
3062         .recvmsg                = tcp_recvmsg,
3063         .sendmsg                = tcp_sendmsg,
3064         .sendpage               = tcp_sendpage,
3065         .backlog_rcv            = tcp_v4_do_rcv,
3066         .release_cb             = tcp_release_cb,
3067         .hash                   = inet_hash,
3068         .unhash                 = inet_unhash,
3069         .get_port               = inet_csk_get_port,
3070         .put_port               = inet_put_port,
3071 #ifdef CONFIG_BPF_SYSCALL
3072         .psock_update_sk_prot   = tcp_bpf_update_proto,
3073 #endif
3074         .enter_memory_pressure  = tcp_enter_memory_pressure,
3075         .leave_memory_pressure  = tcp_leave_memory_pressure,
3076         .stream_memory_free     = tcp_stream_memory_free,
3077         .sockets_allocated      = &tcp_sockets_allocated,
3078         .orphan_count           = &tcp_orphan_count,
3079
3080         .memory_allocated       = &tcp_memory_allocated,
3081         .per_cpu_fw_alloc       = &tcp_memory_per_cpu_fw_alloc,
3082
3083         .memory_pressure        = &tcp_memory_pressure,
3084         .sysctl_mem             = sysctl_tcp_mem,
3085         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3086         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3087         .max_header             = MAX_TCP_HEADER,
3088         .obj_size               = sizeof(struct tcp_sock),
3089         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3090         .twsk_prot              = &tcp_timewait_sock_ops,
3091         .rsk_prot               = &tcp_request_sock_ops,
3092         .h.hashinfo             = NULL,
3093         .no_autobind            = true,
3094         .diag_destroy           = tcp_abort,
3095 };
3096 EXPORT_SYMBOL(tcp_prot);
3097
3098 static void __net_exit tcp_sk_exit(struct net *net)
3099 {
3100         if (net->ipv4.tcp_congestion_control)
3101                 bpf_module_put(net->ipv4.tcp_congestion_control,
3102                                net->ipv4.tcp_congestion_control->owner);
3103 }
3104
3105 static void __net_init tcp_set_hashinfo(struct net *net)
3106 {
3107         struct inet_hashinfo *hinfo;
3108         unsigned int ehash_entries;
3109         struct net *old_net;
3110
3111         if (net_eq(net, &init_net))
3112                 goto fallback;
3113
3114         old_net = current->nsproxy->net_ns;
3115         ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3116         if (!ehash_entries)
3117                 goto fallback;
3118
3119         ehash_entries = roundup_pow_of_two(ehash_entries);
3120         hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3121         if (!hinfo) {
3122                 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3123                         "for a netns, fallback to the global one\n",
3124                         ehash_entries);
3125 fallback:
3126                 hinfo = &tcp_hashinfo;
3127                 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3128         }
3129
3130         net->ipv4.tcp_death_row.hashinfo = hinfo;
3131         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3132         net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3133 }
3134
3135 static int __net_init tcp_sk_init(struct net *net)
3136 {
3137         net->ipv4.sysctl_tcp_ecn = 2;
3138         net->ipv4.sysctl_tcp_ecn_fallback = 1;
3139
3140         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3141         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3142         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3143         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3144         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3145
3146         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3147         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3148         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3149
3150         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3151         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3152         net->ipv4.sysctl_tcp_syncookies = 1;
3153         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3154         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3155         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3156         net->ipv4.sysctl_tcp_orphan_retries = 0;
3157         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3158         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3159         net->ipv4.sysctl_tcp_tw_reuse = 2;
3160         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3161
3162         refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3163         tcp_set_hashinfo(net);
3164
3165         net->ipv4.sysctl_tcp_sack = 1;
3166         net->ipv4.sysctl_tcp_window_scaling = 1;
3167         net->ipv4.sysctl_tcp_timestamps = 1;
3168         net->ipv4.sysctl_tcp_early_retrans = 3;
3169         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3170         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3171         net->ipv4.sysctl_tcp_retrans_collapse = 1;
3172         net->ipv4.sysctl_tcp_max_reordering = 300;
3173         net->ipv4.sysctl_tcp_dsack = 1;
3174         net->ipv4.sysctl_tcp_app_win = 31;
3175         net->ipv4.sysctl_tcp_adv_win_scale = 1;
3176         net->ipv4.sysctl_tcp_frto = 2;
3177         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3178         /* This limits the percentage of the congestion window which we
3179          * will allow a single TSO frame to consume.  Building TSO frames
3180          * which are too large can cause TCP streams to be bursty.
3181          */
3182         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3183         /* Default TSQ limit of 16 TSO segments */
3184         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3185
3186         /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3187         net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3188
3189         net->ipv4.sysctl_tcp_min_tso_segs = 2;
3190         net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3191         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3192         net->ipv4.sysctl_tcp_autocorking = 1;
3193         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3194         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3195         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3196         if (net != &init_net) {
3197                 memcpy(net->ipv4.sysctl_tcp_rmem,
3198                        init_net.ipv4.sysctl_tcp_rmem,
3199                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
3200                 memcpy(net->ipv4.sysctl_tcp_wmem,
3201                        init_net.ipv4.sysctl_tcp_wmem,
3202                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
3203         }
3204         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3205         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3206         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3207         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3208         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3209         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3210
3211         /* Reno is always built in */
3212         if (!net_eq(net, &init_net) &&
3213             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3214                                init_net.ipv4.tcp_congestion_control->owner))
3215                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3216         else
3217                 net->ipv4.tcp_congestion_control = &tcp_reno;
3218
3219         return 0;
3220 }
3221
3222 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3223 {
3224         struct net *net;
3225
3226         tcp_twsk_purge(net_exit_list, AF_INET);
3227
3228         list_for_each_entry(net, net_exit_list, exit_list) {
3229                 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3230                 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3231                 tcp_fastopen_ctx_destroy(net);
3232         }
3233 }
3234
3235 static struct pernet_operations __net_initdata tcp_sk_ops = {
3236        .init       = tcp_sk_init,
3237        .exit       = tcp_sk_exit,
3238        .exit_batch = tcp_sk_exit_batch,
3239 };
3240
3241 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3242 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3243                      struct sock_common *sk_common, uid_t uid)
3244
3245 #define INIT_BATCH_SZ 16
3246
3247 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3248 {
3249         struct bpf_tcp_iter_state *iter = priv_data;
3250         int err;
3251
3252         err = bpf_iter_init_seq_net(priv_data, aux);
3253         if (err)
3254                 return err;
3255
3256         err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3257         if (err) {
3258                 bpf_iter_fini_seq_net(priv_data);
3259                 return err;
3260         }
3261
3262         return 0;
3263 }
3264
3265 static void bpf_iter_fini_tcp(void *priv_data)
3266 {
3267         struct bpf_tcp_iter_state *iter = priv_data;
3268
3269         bpf_iter_fini_seq_net(priv_data);
3270         kvfree(iter->batch);
3271 }
3272
3273 static const struct bpf_iter_seq_info tcp_seq_info = {
3274         .seq_ops                = &bpf_iter_tcp_seq_ops,
3275         .init_seq_private       = bpf_iter_init_tcp,
3276         .fini_seq_private       = bpf_iter_fini_tcp,
3277         .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3278 };
3279
3280 static const struct bpf_func_proto *
3281 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3282                             const struct bpf_prog *prog)
3283 {
3284         switch (func_id) {
3285         case BPF_FUNC_setsockopt:
3286                 return &bpf_sk_setsockopt_proto;
3287         case BPF_FUNC_getsockopt:
3288                 return &bpf_sk_getsockopt_proto;
3289         default:
3290                 return NULL;
3291         }
3292 }
3293
3294 static struct bpf_iter_reg tcp_reg_info = {
3295         .target                 = "tcp",
3296         .ctx_arg_info_size      = 1,
3297         .ctx_arg_info           = {
3298                 { offsetof(struct bpf_iter__tcp, sk_common),
3299                   PTR_TO_BTF_ID_OR_NULL },
3300         },
3301         .get_func_proto         = bpf_iter_tcp_get_func_proto,
3302         .seq_info               = &tcp_seq_info,
3303 };
3304
3305 static void __init bpf_iter_register(void)
3306 {
3307         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3308         if (bpf_iter_reg_target(&tcp_reg_info))
3309                 pr_warn("Warning: could not register bpf iterator tcp\n");
3310 }
3311
3312 #endif
3313
3314 void __init tcp_v4_init(void)
3315 {
3316         int cpu, res;
3317
3318         for_each_possible_cpu(cpu) {
3319                 struct sock *sk;
3320
3321                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3322                                            IPPROTO_TCP, &init_net);
3323                 if (res)
3324                         panic("Failed to create the TCP control socket.\n");
3325                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3326
3327                 /* Please enforce IP_DF and IPID==0 for RST and
3328                  * ACK sent in SYN-RECV and TIME-WAIT state.
3329                  */
3330                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3331
3332                 per_cpu(ipv4_tcp_sk, cpu) = sk;
3333         }
3334         if (register_pernet_subsys(&tcp_sk_ops))
3335                 panic("Failed to create the TCP control socket.\n");
3336
3337 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3338         bpf_iter_register();
3339 #endif
3340 }