1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * Implementation of the Transmission Control Protocol(TCP).
9 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI
44 * and Alexey Kuznetsov : Support IPV6_V6ONLY socket option, which
45 * allows both IPv4 and IPv6 sockets to bind a single port at the same time.
48 #define pr_fmt(fmt) "TCP: " fmt
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
61 #include <net/net_namespace.h>
63 #include <net/inet_hashtables.h>
65 #include <net/transp_v6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
84 #include <trace/events/tcp.h>
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
98 return secure_tcp_seq(ip_hdr(skb)->daddr,
101 tcp_hdr(skb)->source);
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
106 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112 const struct inet_timewait_sock *tw = inet_twsk(sktw);
113 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 struct tcp_sock *tp = tcp_sk(sk);
117 /* Still does not detect *everything* that goes through
118 * lo, since we require a loopback src or dst address
119 * or direct binding to 'lo' interface.
121 bool loopback = false;
122 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
124 #if IS_ENABLED(CONFIG_IPV6)
125 if (tw->tw_family == AF_INET6) {
126 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
134 if (ipv4_is_loopback(tw->tw_daddr) ||
135 ipv4_is_loopback(tw->tw_rcv_saddr))
142 /* With PAWS, it is safe from the viewpoint
143 of data integrity. Even without PAWS it is safe provided sequence
144 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
146 Actually, the idea is close to VJ's, only the timestamp cache is
147 held not per host but per port pair, and the TW bucket is used as state
150 If TW bucket has been already destroyed we fall back to VJ's scheme
151 and use initial timestamp retrieved from peer table.
153 if (tcptw->tw_ts_recent_stamp &&
154 (!twp || (reuse && time_after32(ktime_get_seconds(),
155 tcptw->tw_ts_recent_stamp)))) {
156 /* In case of repair and re-using TIME-WAIT sockets we still
157 * want to be sure that it is safe as above but honor the
158 * sequence numbers and time stamps set as part of the repair
161 * Without this check re-using a TIME-WAIT socket with TCP
162 * repair would accumulate a -1 on the repair assigned
163 * sequence number. The first time it is reused the sequence
164 * is -1, the second time -2, etc. This fixes that issue
165 * without appearing to create any others.
167 if (likely(!tp->repair)) {
168 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
172 WRITE_ONCE(tp->write_seq, seq);
173 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
174 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
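/* Illustrative standalone sketch (not kernel code) of the write_seq bump in
 * tcp_twsk_unique() above: when a TIME-WAIT socket is reused, the new
 * connection's first sequence number is pushed past the old connection's
 * snd_nxt plus the largest unscaled window, so old and new segments cannot
 * be confused. The helper name below is hypothetical.
 */
#include <stdint.h>

static uint32_t tw_reuse_isn_sketch(uint32_t tw_snd_nxt)
{
	/* 65535 covers the largest unscaled receive window; +2 keeps us
	 * clear of the old FIN. Wrap-around is fine: TCP sequence
	 * arithmetic is modulo 2^32. */
	return tw_snd_nxt + 65535 + 2;
}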
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
187 /* This check is replicated from tcp_v4_connect() and intended to
188 * prevent BPF program called below from accessing bytes that are out
189 * of the bound specified by user in addr_len.
191 if (addr_len < sizeof(struct sockaddr_in))
194 sock_owned_by_me(sk);
196 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
202 struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
203 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
204 struct inet_timewait_death_row *tcp_death_row;
205 __be32 daddr, nexthop, prev_sk_rcv_saddr;
206 struct inet_sock *inet = inet_sk(sk);
207 struct tcp_sock *tp = tcp_sk(sk);
208 struct ip_options_rcu *inet_opt;
209 struct net *net = sock_net(sk);
210 __be16 orig_sport, orig_dport;
215 if (addr_len < sizeof(struct sockaddr_in))
218 if (usin->sin_family != AF_INET)
219 return -EAFNOSUPPORT;
221 nexthop = daddr = usin->sin_addr.s_addr;
222 inet_opt = rcu_dereference_protected(inet->inet_opt,
223 lockdep_sock_is_held(sk));
224 if (inet_opt && inet_opt->opt.srr) {
227 nexthop = inet_opt->opt.faddr;
230 orig_sport = inet->inet_sport;
231 orig_dport = usin->sin_port;
232 fl4 = &inet->cork.fl.u.ip4;
233 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
234 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
238 if (err == -ENETUNREACH)
239 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
243 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
248 if (!inet_opt || !inet_opt->opt.srr)
251 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
253 if (!inet->inet_saddr) {
254 if (inet_csk(sk)->icsk_bind2_hash) {
255 prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo,
256 sk, net, inet->inet_num);
257 prev_sk_rcv_saddr = sk->sk_rcv_saddr;
259 inet->inet_saddr = fl4->saddr;
262 sk_rcv_saddr_set(sk, inet->inet_saddr);
264 if (prev_addr_hashbucket) {
265 err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
267 inet->inet_saddr = 0;
268 sk_rcv_saddr_set(sk, prev_sk_rcv_saddr);
274 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
275 /* Reset inherited state */
276 tp->rx_opt.ts_recent = 0;
277 tp->rx_opt.ts_recent_stamp = 0;
278 if (likely(!tp->repair))
279 WRITE_ONCE(tp->write_seq, 0);
282 inet->inet_dport = usin->sin_port;
283 sk_daddr_set(sk, daddr);
285 inet_csk(sk)->icsk_ext_hdr_len = 0;
287 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
289 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
291 * Socket identity is still unknown (sport may be zero).
292 * However we set state to SYN-SENT and, without releasing the socket
293 * lock, select a source port, enter ourselves into the hash tables and
294 * complete initialization after this.
296 tcp_set_state(sk, TCP_SYN_SENT);
297 err = inet_hash_connect(tcp_death_row, sk);
303 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
304 inet->inet_sport, inet->inet_dport, sk);
310 /* OK, now commit destination to socket. */
311 sk->sk_gso_type = SKB_GSO_TCPV4;
312 sk_setup_caps(sk, &rt->dst);
315 if (likely(!tp->repair)) {
317 WRITE_ONCE(tp->write_seq,
318 secure_tcp_seq(inet->inet_saddr,
322 tp->tsoffset = secure_tcp_ts_off(net, inet->inet_saddr,
326 inet->inet_id = get_random_u16();
328 if (tcp_fastopen_defer_connect(sk, &err))
333 err = tcp_connect(sk);
342 * This unhashes the socket and releases the local port,
345 tcp_set_state(sk, TCP_CLOSE);
347 sk->sk_route_caps = 0;
348 inet->inet_dport = 0;
351 EXPORT_SYMBOL(tcp_v4_connect);
354 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
355 * It can be called through tcp_release_cb() if socket was owned by user
356 * at the time tcp_v4_err() was called to handle ICMP message.
358 void tcp_v4_mtu_reduced(struct sock *sk)
360 struct inet_sock *inet = inet_sk(sk);
361 struct dst_entry *dst;
364 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
366 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
367 dst = inet_csk_update_pmtu(sk, mtu);
371 /* Something is about to go wrong... Remember the soft error
372 * for the case where this connection will not be able to recover.
374 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
375 sk->sk_err_soft = EMSGSIZE;
379 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
380 ip_sk_accept_pmtu(sk) &&
381 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
382 tcp_sync_mss(sk, mtu);
384 /* Resend the TCP packet because it's
385 * clear that the old packet has been
386 * dropped. This is the new "fast" path mtu
389 tcp_simple_retransmit(sk);
390 } /* else let the usual retransmit timer handle it */
392 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
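/* Rough sketch of what tcp_sync_mss() boils down to for the PMTU case
 * handled above (illustrative only): the usable MSS is the path MTU minus
 * the fixed IPv4 and TCP headers; TCP options (timestamps, MD5, ...) and
 * IP options shrink it further and are ignored in this sketch.
 */
#include <stdint.h>

static uint32_t mss_from_mtu_sketch(uint32_t pmtu)
{
	const uint32_t ipv4_hdr = 20;	/* no IP options assumed */
	const uint32_t tcp_hdr = 20;	/* no TCP options assumed */

	return pmtu > ipv4_hdr + tcp_hdr ? pmtu - ipv4_hdr - tcp_hdr : 0;
}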
394 static void do_redirect(struct sk_buff *skb, struct sock *sk)
396 struct dst_entry *dst = __sk_dst_check(sk, 0);
399 dst->ops->redirect(dst, sk, skb);
403 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
404 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
406 struct request_sock *req = inet_reqsk(sk);
407 struct net *net = sock_net(sk);
409 /* ICMPs are not backlogged, hence we cannot get
410 * an established socket here.
412 if (seq != tcp_rsk(req)->snt_isn) {
413 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
416 * Still in SYN_RECV, just remove it silently.
417 * There is no good way to pass the error to the newly
418 * created socket, and POSIX does not want network
419 * errors returned from accept().
421 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
422 tcp_listendrop(req->rsk_listener);
426 EXPORT_SYMBOL(tcp_req_err);
428 /* TCP-LD (RFC 6069) logic */
429 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
431 struct inet_connection_sock *icsk = inet_csk(sk);
432 struct tcp_sock *tp = tcp_sk(sk);
437 if (sock_owned_by_user(sk))
440 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
444 skb = tcp_rtx_queue_head(sk);
445 if (WARN_ON_ONCE(!skb))
448 icsk->icsk_backoff--;
449 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
450 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
452 tcp_mstamp_refresh(tp);
453 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
454 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
457 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
458 remaining, TCP_RTO_MAX);
460 /* RTO revert clocked out retransmission.
461 * Will retransmit now.
463 tcp_retransmit_timer(sk);
466 EXPORT_SYMBOL(tcp_ld_RTO_revert);
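/* Userspace-style sketch of the re-arm arithmetic in tcp_ld_RTO_revert()
 * above (illustrative only): after undoing one backoff step, the timer is
 * set to whatever remains of the recomputed RTO once the time already spent
 * since the head-of-queue transmission is subtracted; a zero remainder
 * means "retransmit right away". The helper name is hypothetical.
 */
#include <stdint.h>

static uint64_t rto_remaining_us_sketch(uint64_t rto_us, uint64_t now_us,
					uint64_t head_skb_tx_us)
{
	uint64_t elapsed_us = now_us - head_skb_tx_us;

	return elapsed_us >= rto_us ? 0 : rto_us - elapsed_us;
}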
469 * This routine is called by the ICMP module when it gets some
470 * sort of error condition. If err < 0 then the socket should
471 * be closed and the error returned to the user. If err > 0
472 * it's just the icmp type << 8 | icmp code. After adjustment
473 * header points to the first 8 bytes of the tcp header. We need
474 * to find the appropriate port.
476 * The locking strategy used here is very "optimistic". When
477 * someone else accesses the socket the ICMP is just dropped
478 * and for some paths there is no check at all.
479 * A more general error queue to queue errors for later handling
480 * is probably better.
484 int tcp_v4_err(struct sk_buff *skb, u32 info)
486 const struct iphdr *iph = (const struct iphdr *)skb->data;
487 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
489 struct inet_sock *inet;
490 const int type = icmp_hdr(skb)->type;
491 const int code = icmp_hdr(skb)->code;
493 struct request_sock *fastopen;
496 struct net *net = dev_net(skb->dev);
498 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
499 iph->daddr, th->dest, iph->saddr,
500 ntohs(th->source), inet_iif(skb), 0);
502 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
505 if (sk->sk_state == TCP_TIME_WAIT) {
506 inet_twsk_put(inet_twsk(sk));
509 seq = ntohl(th->seq);
510 if (sk->sk_state == TCP_NEW_SYN_RECV) {
511 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
512 type == ICMP_TIME_EXCEEDED ||
513 (type == ICMP_DEST_UNREACH &&
514 (code == ICMP_NET_UNREACH ||
515 code == ICMP_HOST_UNREACH)));
520 /* If too many ICMPs get dropped on busy
521 * servers this needs to be solved differently.
522 * We do take care of PMTU discovery (RFC1191) special case :
523 * we can receive locally generated ICMP messages while socket is held.
525 if (sock_owned_by_user(sk)) {
526 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
527 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
529 if (sk->sk_state == TCP_CLOSE)
532 if (static_branch_unlikely(&ip4_min_ttl)) {
533 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
534 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
535 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
541 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
542 fastopen = rcu_dereference(tp->fastopen_rsk);
543 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
544 if (sk->sk_state != TCP_LISTEN &&
545 !between(seq, snd_una, tp->snd_nxt)) {
546 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
552 if (!sock_owned_by_user(sk))
553 do_redirect(skb, sk);
555 case ICMP_SOURCE_QUENCH:
556 /* Just silently ignore these. */
558 case ICMP_PARAMETERPROB:
561 case ICMP_DEST_UNREACH:
562 if (code > NR_ICMP_UNREACH)
565 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
566 /* We are not interested in TCP_LISTEN and open_requests
567 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
568 * they should go through unfragmented).
570 if (sk->sk_state == TCP_LISTEN)
573 WRITE_ONCE(tp->mtu_info, info);
574 if (!sock_owned_by_user(sk)) {
575 tcp_v4_mtu_reduced(sk);
577 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
583 err = icmp_err_convert[code].errno;
584 /* check if this ICMP message allows revert of backoff.
588 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
589 tcp_ld_RTO_revert(sk, seq);
591 case ICMP_TIME_EXCEEDED:
598 switch (sk->sk_state) {
601 /* Only in fast or simultaneous open. If a fast open socket is
602 * already accepted it is treated as a connected one below.
604 if (fastopen && !fastopen->sk)
607 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
609 if (!sock_owned_by_user(sk)) {
616 sk->sk_err_soft = err;
621 /* If we've already connected we will keep trying
622 * until we time out, or the user gives up.
624 * rfc1122 4.2.3.9 allows considering only PROTO_UNREACH and
625 * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
626 * but it is obsoleted by pmtu discovery).
628 * Note that in the modern internet, where routing is unreliable
629 * and broken firewalls sit in every dark corner sending random
630 * errors as ordered by their masters, even these two messages have
631 * finally lost their original sense (even Linux sends invalid PORT_UNREACHs).
633 * Now we are in compliance with the RFCs.
638 if (!sock_owned_by_user(sk) && inet->recverr) {
641 } else { /* Only an error on timeout */
642 sk->sk_err_soft = err;
651 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
653 struct tcphdr *th = tcp_hdr(skb);
655 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
656 skb->csum_start = skb_transport_header(skb) - skb->head;
657 skb->csum_offset = offsetof(struct tcphdr, check);
660 /* This routine computes an IPv4 TCP checksum. */
661 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
663 const struct inet_sock *inet = inet_sk(sk);
665 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
667 EXPORT_SYMBOL(tcp_v4_send_check);
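/* Sketch of the seed computed by tcp_v4_check() above (illustrative, not
 * the kernel implementation): the IPv4 pseudo-header (source address,
 * destination address, protocol and TCP length) is folded into a 16-bit
 * one's-complement sum. The real code leaves the header/payload part to
 * checksum offload via csum_start/csum_offset; byte-order handling is
 * glossed over here, the addresses are taken as plain 32-bit values.
 */
#include <stdint.h>

static uint16_t pseudo_hdr_csum_sketch(uint32_t saddr, uint32_t daddr,
				       uint16_t tcp_len)
{
	uint64_t sum = 0;

	sum += (saddr >> 16) + (saddr & 0xffff);
	sum += (daddr >> 16) + (daddr & 0xffff);
	sum += 6;		/* IPPROTO_TCP */
	sum += tcp_len;

	while (sum >> 16)	/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);

	return (uint16_t)sum;
}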
670 * This routine will send an RST to the other tcp.
672 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)?
674 * Answer: if a packet caused an RST, it is not for a socket
675 * existing in our system; if it is matched to a socket,
676 * it is just a duplicate segment or a bug in the other side's TCP.
677 * So we build the reply based only on the parameters that
678 * arrived with the segment.
679 * Exception: precedence violation. We do not implement it in any case.
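/* Illustrative sketch of the SEQ/ACK choice tcp_v4_send_reset() makes
 * below, per RFC 793: if the offending segment carried an ACK, the RST's
 * SEQ is that ACK value; otherwise the RST carries no meaningful SEQ and
 * instead ACKs everything the segment occupied (payload plus one sequence
 * number each for SYN and FIN). Names below are hypothetical.
 */
#include <stdint.h>

struct rst_nums_sketch {
	uint32_t seq;
	uint32_t ack_seq;
	int	 ack_set;
};

static struct rst_nums_sketch rst_seq_ack_sketch(uint32_t seg_seq,
						 uint32_t seg_ack,
						 int has_ack, int syn,
						 int fin, uint32_t data_len)
{
	struct rst_nums_sketch r = { 0, 0, 0 };

	if (has_ack) {
		r.seq = seg_ack;
	} else {
		r.ack_set = 1;
		r.ack_seq = seg_seq + syn + fin + data_len;
	}
	return r;
}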
682 #ifdef CONFIG_TCP_MD5SIG
683 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
685 #define OPTION_BYTES sizeof(__be32)
688 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
690 const struct tcphdr *th = tcp_hdr(skb);
693 __be32 opt[OPTION_BYTES / sizeof(__be32)];
695 struct ip_reply_arg arg;
696 #ifdef CONFIG_TCP_MD5SIG
697 struct tcp_md5sig_key *key = NULL;
698 const __u8 *hash_location = NULL;
699 unsigned char newhash[16];
701 struct sock *sk1 = NULL;
703 u64 transmit_time = 0;
707 /* Never send a reset in response to a reset. */
711 /* If sk not NULL, it means we did a successful lookup and incoming
712 * route had to be correct. prequeue might have dropped our dst.
714 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
717 /* Swap the send and the receive. */
718 memset(&rep, 0, sizeof(rep));
719 rep.th.dest = th->source;
720 rep.th.source = th->dest;
721 rep.th.doff = sizeof(struct tcphdr) / 4;
725 rep.th.seq = th->ack_seq;
728 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
729 skb->len - (th->doff << 2));
732 memset(&arg, 0, sizeof(arg));
733 arg.iov[0].iov_base = (unsigned char *)&rep;
734 arg.iov[0].iov_len = sizeof(rep.th);
736 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
737 #ifdef CONFIG_TCP_MD5SIG
739 hash_location = tcp_parse_md5sig_option(th);
740 if (sk && sk_fullsock(sk)) {
741 const union tcp_md5_addr *addr;
744 /* sdif set, means packet ingressed via a device
745 * in an L3 domain and inet_iif is set to it.
747 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
748 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
749 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
750 } else if (hash_location) {
751 const union tcp_md5_addr *addr;
752 int sdif = tcp_v4_sdif(skb);
753 int dif = inet_iif(skb);
757 * active side is lost. Try to find the listening socket through the
758 * source port, and then find the md5 key through the listening socket.
759 * We are not losing security here:
760 * the incoming packet is checked against the md5 hash of the found key,
761 * and no RST is generated if the md5 hash doesn't match.
763 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
764 NULL, 0, ip_hdr(skb)->saddr,
765 th->source, ip_hdr(skb)->daddr,
766 ntohs(th->source), dif, sdif);
767 /* don't send rst if it can't find key */
771 /* sdif set, means packet ingressed via a device
772 * in an L3 domain and dif is set to it.
774 l3index = sdif ? dif : 0;
775 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
776 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
781 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
782 if (genhash || memcmp(hash_location, newhash, 16) != 0)
788 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
790 (TCPOPT_MD5SIG << 8) |
792 /* Update length and the length the header thinks exists */
793 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
794 rep.th.doff = arg.iov[0].iov_len / 4;
796 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
797 key, ip_hdr(skb)->saddr,
798 ip_hdr(skb)->daddr, &rep.th);
801 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
802 if (rep.opt[0] == 0) {
803 __be32 mrst = mptcp_reset_option(skb);
807 arg.iov[0].iov_len += sizeof(mrst);
808 rep.th.doff = arg.iov[0].iov_len / 4;
812 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
813 ip_hdr(skb)->saddr, /* XXX */
814 arg.iov[0].iov_len, IPPROTO_TCP, 0);
815 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
816 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
818 /* When the socket is gone, all binding information is lost and
819 * routing might fail in this case. No choice here: if we choose to force
820 * the input interface, we will misroute in the case of an asymmetric route.
823 arg.bound_dev_if = sk->sk_bound_dev_if;
825 trace_tcp_send_reset(sk, skb);
828 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
829 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
831 arg.tos = ip_hdr(skb)->tos;
832 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
834 ctl_sk = this_cpu_read(ipv4_tcp_sk);
835 sock_net_set(ctl_sk, net);
837 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
838 inet_twsk(sk)->tw_mark : sk->sk_mark;
839 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
840 inet_twsk(sk)->tw_priority : sk->sk_priority;
841 transmit_time = tcp_transmit_time(sk);
842 xfrm_sk_clone_policy(ctl_sk, sk);
844 ip_send_unicast_reply(ctl_sk,
845 skb, &TCP_SKB_CB(skb)->header.h4.opt,
846 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
847 &arg, arg.iov[0].iov_len,
851 xfrm_sk_free_policy(ctl_sk);
852 sock_net_set(ctl_sk, &init_net);
853 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
854 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
857 #ifdef CONFIG_TCP_MD5SIG
863 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
864 outside socket context, is certainly ugly. What can I do?
867 static void tcp_v4_send_ack(const struct sock *sk,
868 struct sk_buff *skb, u32 seq, u32 ack,
869 u32 win, u32 tsval, u32 tsecr, int oif,
870 struct tcp_md5sig_key *key,
871 int reply_flags, u8 tos)
873 const struct tcphdr *th = tcp_hdr(skb);
876 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
877 #ifdef CONFIG_TCP_MD5SIG
878 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
882 struct net *net = sock_net(sk);
883 struct ip_reply_arg arg;
887 memset(&rep.th, 0, sizeof(struct tcphdr));
888 memset(&arg, 0, sizeof(arg));
890 arg.iov[0].iov_base = (unsigned char *)&rep;
891 arg.iov[0].iov_len = sizeof(rep.th);
893 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
894 (TCPOPT_TIMESTAMP << 8) |
896 rep.opt[1] = htonl(tsval);
897 rep.opt[2] = htonl(tsecr);
898 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
901 /* Swap the send and the receive. */
902 rep.th.dest = th->source;
903 rep.th.source = th->dest;
904 rep.th.doff = arg.iov[0].iov_len / 4;
905 rep.th.seq = htonl(seq);
906 rep.th.ack_seq = htonl(ack);
908 rep.th.window = htons(win);
910 #ifdef CONFIG_TCP_MD5SIG
912 int offset = (tsecr) ? 3 : 0;
914 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
916 (TCPOPT_MD5SIG << 8) |
918 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
919 rep.th.doff = arg.iov[0].iov_len/4;
921 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
922 key, ip_hdr(skb)->saddr,
923 ip_hdr(skb)->daddr, &rep.th);
926 arg.flags = reply_flags;
927 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
928 ip_hdr(skb)->saddr, /* XXX */
929 arg.iov[0].iov_len, IPPROTO_TCP, 0);
930 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
932 arg.bound_dev_if = oif;
934 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
936 ctl_sk = this_cpu_read(ipv4_tcp_sk);
937 sock_net_set(ctl_sk, net);
938 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
939 inet_twsk(sk)->tw_mark : sk->sk_mark;
940 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
941 inet_twsk(sk)->tw_priority : sk->sk_priority;
942 transmit_time = tcp_transmit_time(sk);
943 ip_send_unicast_reply(ctl_sk,
944 skb, &TCP_SKB_CB(skb)->header.h4.opt,
945 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
946 &arg, arg.iov[0].iov_len,
950 sock_net_set(ctl_sk, &init_net);
951 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
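/* Sketch of the aligned timestamp option built by tcp_v4_send_ack() above
 * (illustrative only): one 32-bit word packs NOP, NOP, kind 8 (TIMESTAMP)
 * and length 10, followed by TSval and TSecr. The htonl() conversions the
 * real code applies are omitted here for brevity.
 */
#include <stdint.h>

static void pack_tsopt_sketch(uint32_t opt[3], uint32_t tsval, uint32_t tsecr)
{
	opt[0] = (1u << 24) | (1u << 16) | (8u << 8) | 10u;
	opt[1] = tsval;
	opt[2] = tsecr;
}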
955 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
957 struct inet_timewait_sock *tw = inet_twsk(sk);
958 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
960 tcp_v4_send_ack(sk, skb,
961 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
962 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
963 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
966 tcp_twsk_md5_key(tcptw),
967 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
974 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
975 struct request_sock *req)
977 const union tcp_md5_addr *addr;
980 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
981 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
983 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
987 * The window field (SEG.WND) of every outgoing segment, with the
988 * exception of <SYN> segments, MUST be right-shifted by
989 * Rcv.Wind.Shift bits:
991 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
992 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
993 tcp_v4_send_ack(sk, skb, seq,
994 tcp_rsk(req)->rcv_nxt,
995 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
996 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
999 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
1000 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
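/* Sketch of the RFC 7323 rule quoted in tcp_v4_reqsk_send_ack() above
 * (illustrative only): every outgoing non-SYN segment advertises the
 * receive window right-shifted by the negotiated receive window scale,
 * capped by what the 16-bit window field can carry.
 */
#include <stdint.h>

static uint16_t advertised_window_sketch(uint32_t rcv_wnd, uint8_t rcv_wscale)
{
	uint32_t w = rcv_wnd >> rcv_wscale;

	return w > 0xffff ? 0xffff : (uint16_t)w;
}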
1005 * Send a SYN-ACK after having received a SYN.
1006 * This still operates on a request_sock only, not on a big
1009 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1011 struct request_sock *req,
1012 struct tcp_fastopen_cookie *foc,
1013 enum tcp_synack_type synack_type,
1014 struct sk_buff *syn_skb)
1016 const struct inet_request_sock *ireq = inet_rsk(req);
1019 struct sk_buff *skb;
1022 /* First, grab a route. */
1023 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1026 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1029 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1031 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1032 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1033 (inet_sk(sk)->tos & INET_ECN_MASK) :
1036 if (!INET_ECN_is_capable(tos) &&
1037 tcp_bpf_ca_needs_ecn((struct sock *)req))
1038 tos |= INET_ECN_ECT_0;
1041 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1043 rcu_dereference(ireq->ireq_opt),
1046 err = net_xmit_eval(err);
1053 * IPv4 request_sock destructor.
1055 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1057 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1060 #ifdef CONFIG_TCP_MD5SIG
1062 * RFC2385 MD5 checksumming requires a mapping of
1063 * IP address->MD5 Key.
1064 * We need to maintain these in the sk structure.
1067 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1068 EXPORT_SYMBOL(tcp_md5_needed);
1070 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1075 /* l3index always overrides non-l3index */
1076 if (old->l3index && new->l3index == 0)
1078 if (old->l3index == 0 && new->l3index)
1081 return old->prefixlen < new->prefixlen;
1084 /* Find the Key structure for an address. */
1085 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1086 const union tcp_md5_addr *addr,
1089 const struct tcp_sock *tp = tcp_sk(sk);
1090 struct tcp_md5sig_key *key;
1091 const struct tcp_md5sig_info *md5sig;
1093 struct tcp_md5sig_key *best_match = NULL;
1096 /* caller either holds rcu_read_lock() or socket lock */
1097 md5sig = rcu_dereference_check(tp->md5sig_info,
1098 lockdep_sock_is_held(sk));
1102 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1103 lockdep_sock_is_held(sk)) {
1104 if (key->family != family)
1106 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1108 if (family == AF_INET) {
1109 mask = inet_make_mask(key->prefixlen);
1110 match = (key->addr.a4.s_addr & mask) ==
1111 (addr->a4.s_addr & mask);
1112 #if IS_ENABLED(CONFIG_IPV6)
1113 } else if (family == AF_INET6) {
1114 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1121 if (match && better_md5_match(best_match, key))
1126 EXPORT_SYMBOL(__tcp_md5_do_lookup);
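/* Userspace-style sketch of the matching rules used by
 * __tcp_md5_do_lookup() and better_md5_match() above (illustrative only;
 * addresses are taken in host byte order for readability). A key matches
 * when the peer address agrees with the key address under the key's
 * /prefixlen mask; among matches, an L3-domain-bound key beats an unbound
 * one, and otherwise the longer prefix wins.
 */
#include <stdbool.h>
#include <stdint.h>

static bool md5_prefix_match_sketch(uint32_t key_addr, uint8_t prefixlen,
				    uint32_t peer_addr)
{
	uint32_t mask = prefixlen ? ~0u << (32 - prefixlen) : 0;

	return (key_addr & mask) == (peer_addr & mask);
}

static bool md5_key_is_better_sketch(int old_l3index, uint8_t old_prefixlen,
				     int new_l3index, uint8_t new_prefixlen)
{
	if (old_l3index && !new_l3index)
		return false;
	if (!old_l3index && new_l3index)
		return true;
	return old_prefixlen < new_prefixlen;
}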
1128 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1129 const union tcp_md5_addr *addr,
1130 int family, u8 prefixlen,
1131 int l3index, u8 flags)
1133 const struct tcp_sock *tp = tcp_sk(sk);
1134 struct tcp_md5sig_key *key;
1135 unsigned int size = sizeof(struct in_addr);
1136 const struct tcp_md5sig_info *md5sig;
1138 /* caller either holds rcu_read_lock() or socket lock */
1139 md5sig = rcu_dereference_check(tp->md5sig_info,
1140 lockdep_sock_is_held(sk));
1143 #if IS_ENABLED(CONFIG_IPV6)
1144 if (family == AF_INET6)
1145 size = sizeof(struct in6_addr);
1147 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1148 lockdep_sock_is_held(sk)) {
1149 if (key->family != family)
1151 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1153 if (key->l3index != l3index)
1155 if (!memcmp(&key->addr, addr, size) &&
1156 key->prefixlen == prefixlen)
1162 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1163 const struct sock *addr_sk)
1165 const union tcp_md5_addr *addr;
1168 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1169 addr_sk->sk_bound_dev_if);
1170 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1171 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1173 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1175 /* This can be called on a newly created socket, from other files */
1176 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1177 int family, u8 prefixlen, int l3index, u8 flags,
1178 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1180 /* Add Key to the list */
1181 struct tcp_md5sig_key *key;
1182 struct tcp_sock *tp = tcp_sk(sk);
1183 struct tcp_md5sig_info *md5sig;
1185 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1187 /* Pre-existing entry - just update that one.
1188 * Note that the key might be used concurrently.
1189 * data_race() is telling kcsan that we do not care of
1190 * key mismatches, since changing MD5 key on live flows
1191 * can lead to packet drops.
1193 data_race(memcpy(key->key, newkey, newkeylen));
1195 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1196 * Also note that a reader could catch new key->keylen value
1197 * but old key->key[], this is the reason we use __GFP_ZERO
1198 * at sock_kmalloc() time below these lines.
1200 WRITE_ONCE(key->keylen, newkeylen);
1205 md5sig = rcu_dereference_protected(tp->md5sig_info,
1206 lockdep_sock_is_held(sk));
1208 md5sig = kmalloc(sizeof(*md5sig), gfp);
1213 INIT_HLIST_HEAD(&md5sig->head);
1214 rcu_assign_pointer(tp->md5sig_info, md5sig);
1217 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1220 if (!tcp_alloc_md5sig_pool()) {
1221 sock_kfree_s(sk, key, sizeof(*key));
1225 memcpy(key->key, newkey, newkeylen);
1226 key->keylen = newkeylen;
1227 key->family = family;
1228 key->prefixlen = prefixlen;
1229 key->l3index = l3index;
1231 memcpy(&key->addr, addr,
1232 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1233 sizeof(struct in_addr));
1234 hlist_add_head_rcu(&key->node, &md5sig->head);
1237 EXPORT_SYMBOL(tcp_md5_do_add);
1239 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1240 u8 prefixlen, int l3index, u8 flags)
1242 struct tcp_md5sig_key *key;
1244 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1247 hlist_del_rcu(&key->node);
1248 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1249 kfree_rcu(key, rcu);
1252 EXPORT_SYMBOL(tcp_md5_do_del);
1254 static void tcp_clear_md5_list(struct sock *sk)
1256 struct tcp_sock *tp = tcp_sk(sk);
1257 struct tcp_md5sig_key *key;
1258 struct hlist_node *n;
1259 struct tcp_md5sig_info *md5sig;
1261 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1263 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1264 hlist_del_rcu(&key->node);
1265 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1266 kfree_rcu(key, rcu);
1270 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1271 sockptr_t optval, int optlen)
1273 struct tcp_md5sig cmd;
1274 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1275 const union tcp_md5_addr *addr;
1280 if (optlen < sizeof(cmd))
1283 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1286 if (sin->sin_family != AF_INET)
1289 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1291 if (optname == TCP_MD5SIG_EXT &&
1292 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1293 prefixlen = cmd.tcpm_prefixlen;
1298 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1299 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1300 struct net_device *dev;
1303 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1304 if (dev && netif_is_l3_master(dev))
1305 l3index = dev->ifindex;
1309 /* ok to reference set/not set outside of rcu;
1310 * right now device MUST be an L3 master
1312 if (!dev || !l3index)
1316 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1318 if (!cmd.tcpm_keylen)
1319 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1321 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1324 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1325 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1328 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1329 __be32 daddr, __be32 saddr,
1330 const struct tcphdr *th, int nbytes)
1332 struct tcp4_pseudohdr *bp;
1333 struct scatterlist sg;
1340 bp->protocol = IPPROTO_TCP;
1341 bp->len = cpu_to_be16(nbytes);
1343 _th = (struct tcphdr *)(bp + 1);
1344 memcpy(_th, th, sizeof(*th));
1347 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1348 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1349 sizeof(*bp) + sizeof(*th));
1350 return crypto_ahash_update(hp->md5_req);
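/* Layout sketch of the block hashed first by tcp_v4_md5_hash_headers()
 * above (RFC 2385): the IPv4 pseudo-header, immediately followed in the
 * same buffer by a copy of the TCP header with its checksum field zeroed.
 * This mirrors struct tcp4_pseudohdr; the copy below is for illustration.
 */
#include <stdint.h>

struct tcp4_pseudohdr_sketch {
	uint32_t saddr;		/* source IPv4 address (network order) */
	uint32_t daddr;		/* destination IPv4 address (network order) */
	uint8_t	 pad;		/* always zero */
	uint8_t	 protocol;	/* IPPROTO_TCP (6) */
	uint16_t len;		/* TCP header + payload length (network order) */
};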
1353 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1354 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1356 struct tcp_md5sig_pool *hp;
1357 struct ahash_request *req;
1359 hp = tcp_get_md5sig_pool();
1361 goto clear_hash_noput;
1364 if (crypto_ahash_init(req))
1366 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1368 if (tcp_md5_hash_key(hp, key))
1370 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1371 if (crypto_ahash_final(req))
1374 tcp_put_md5sig_pool();
1378 tcp_put_md5sig_pool();
1380 memset(md5_hash, 0, 16);
1384 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1385 const struct sock *sk,
1386 const struct sk_buff *skb)
1388 struct tcp_md5sig_pool *hp;
1389 struct ahash_request *req;
1390 const struct tcphdr *th = tcp_hdr(skb);
1391 __be32 saddr, daddr;
1393 if (sk) { /* valid for establish/request sockets */
1394 saddr = sk->sk_rcv_saddr;
1395 daddr = sk->sk_daddr;
1397 const struct iphdr *iph = ip_hdr(skb);
1402 hp = tcp_get_md5sig_pool();
1404 goto clear_hash_noput;
1407 if (crypto_ahash_init(req))
1410 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1412 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1414 if (tcp_md5_hash_key(hp, key))
1416 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1417 if (crypto_ahash_final(req))
1420 tcp_put_md5sig_pool();
1424 tcp_put_md5sig_pool();
1426 memset(md5_hash, 0, 16);
1429 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1433 static void tcp_v4_init_req(struct request_sock *req,
1434 const struct sock *sk_listener,
1435 struct sk_buff *skb)
1437 struct inet_request_sock *ireq = inet_rsk(req);
1438 struct net *net = sock_net(sk_listener);
1440 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1441 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1442 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1445 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1446 struct sk_buff *skb,
1448 struct request_sock *req)
1450 tcp_v4_init_req(req, sk, skb);
1452 if (security_inet_conn_request(sk, skb, req))
1455 return inet_csk_route_req(sk, &fl->u.ip4, req);
1458 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1460 .obj_size = sizeof(struct tcp_request_sock),
1461 .rtx_syn_ack = tcp_rtx_synack,
1462 .send_ack = tcp_v4_reqsk_send_ack,
1463 .destructor = tcp_v4_reqsk_destructor,
1464 .send_reset = tcp_v4_send_reset,
1465 .syn_ack_timeout = tcp_syn_ack_timeout,
1468 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1469 .mss_clamp = TCP_MSS_DEFAULT,
1470 #ifdef CONFIG_TCP_MD5SIG
1471 .req_md5_lookup = tcp_v4_md5_lookup,
1472 .calc_md5_hash = tcp_v4_md5_hash_skb,
1474 #ifdef CONFIG_SYN_COOKIES
1475 .cookie_init_seq = cookie_v4_init_sequence,
1477 .route_req = tcp_v4_route_req,
1478 .init_seq = tcp_v4_init_seq,
1479 .init_ts_off = tcp_v4_init_ts_off,
1480 .send_synack = tcp_v4_send_synack,
1483 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1485 /* Never answer SYNs sent to broadcast or multicast */
1486 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1489 return tcp_conn_request(&tcp_request_sock_ops,
1490 &tcp_request_sock_ipv4_ops, sk, skb);
1496 EXPORT_SYMBOL(tcp_v4_conn_request);
1500 * The three way handshake has completed - we got a valid synack -
1501 * now create the new socket.
1503 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1504 struct request_sock *req,
1505 struct dst_entry *dst,
1506 struct request_sock *req_unhash,
1509 struct inet_request_sock *ireq;
1510 bool found_dup_sk = false;
1511 struct inet_sock *newinet;
1512 struct tcp_sock *newtp;
1514 #ifdef CONFIG_TCP_MD5SIG
1515 const union tcp_md5_addr *addr;
1516 struct tcp_md5sig_key *key;
1519 struct ip_options_rcu *inet_opt;
1521 if (sk_acceptq_is_full(sk))
1524 newsk = tcp_create_openreq_child(sk, req, skb);
1528 newsk->sk_gso_type = SKB_GSO_TCPV4;
1529 inet_sk_rx_dst_set(newsk, skb);
1531 newtp = tcp_sk(newsk);
1532 newinet = inet_sk(newsk);
1533 ireq = inet_rsk(req);
1534 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1535 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1536 newsk->sk_bound_dev_if = ireq->ir_iif;
1537 newinet->inet_saddr = ireq->ir_loc_addr;
1538 inet_opt = rcu_dereference(ireq->ireq_opt);
1539 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1540 newinet->mc_index = inet_iif(skb);
1541 newinet->mc_ttl = ip_hdr(skb)->ttl;
1542 newinet->rcv_tos = ip_hdr(skb)->tos;
1543 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1545 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1546 newinet->inet_id = get_random_u16();
1548 /* Set ToS of the new socket based upon the value of incoming SYN.
1549 * ECT bits are set later in tcp_init_transfer().
1551 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1552 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1555 dst = inet_csk_route_child_sock(sk, newsk, req);
1559 /* syncookie case : see end of cookie_v4_check() */
1561 sk_setup_caps(newsk, dst);
1563 tcp_ca_openreq_child(newsk, dst);
1565 tcp_sync_mss(newsk, dst_mtu(dst));
1566 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1568 tcp_initialize_rcv_mss(newsk);
1570 #ifdef CONFIG_TCP_MD5SIG
1571 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1572 /* Copy over the MD5 key from the original socket */
1573 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1574 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1577 * We're using one, so create a matching key
1578 * on the newsk structure. If we fail to get
1579 * memory, then we end up not copying the key
1582 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1583 key->key, key->keylen, GFP_ATOMIC);
1584 sk_gso_disable(newsk);
1588 if (__inet_inherit_port(sk, newsk) < 0)
1590 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1592 if (likely(*own_req)) {
1593 tcp_move_syn(newtp, req);
1594 ireq->ireq_opt = NULL;
1596 newinet->inet_opt = NULL;
1598 if (!req_unhash && found_dup_sk) {
1599 /* This code path should only be executed in the
1600 * syncookie case only
1602 bh_unlock_sock(newsk);
1610 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1617 newinet->inet_opt = NULL;
1618 inet_csk_prepare_forced_close(newsk);
1622 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1624 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1626 #ifdef CONFIG_SYN_COOKIES
1627 const struct tcphdr *th = tcp_hdr(skb);
1630 sk = cookie_v4_check(sk, skb);
1635 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1636 struct tcphdr *th, u32 *cookie)
1639 #ifdef CONFIG_SYN_COOKIES
1640 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1641 &tcp_request_sock_ipv4_ops, sk, th);
1643 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1644 tcp_synq_overflow(sk);
1650 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1652 /* The socket must have its spinlock held when we get
1653 * here, unless it is a TCP_LISTEN socket.
1655 * We have a potential double-lock case here, so even when
1656 * doing backlog processing we use the BH locking scheme.
1657 * This is because we cannot sleep with the original spinlock
1660 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1662 enum skb_drop_reason reason;
1665 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1666 struct dst_entry *dst;
1668 dst = rcu_dereference_protected(sk->sk_rx_dst,
1669 lockdep_sock_is_held(sk));
1671 sock_rps_save_rxhash(sk, skb);
1672 sk_mark_napi_id(sk, skb);
1674 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1675 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1677 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1681 tcp_rcv_established(sk, skb);
1685 reason = SKB_DROP_REASON_NOT_SPECIFIED;
1686 if (tcp_checksum_complete(skb))
1689 if (sk->sk_state == TCP_LISTEN) {
1690 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1695 if (tcp_child_process(sk, nsk, skb)) {
1702 sock_rps_save_rxhash(sk, skb);
1704 if (tcp_rcv_state_process(sk, skb)) {
1711 tcp_v4_send_reset(rsk, skb);
1713 kfree_skb_reason(skb, reason);
1714 /* Be careful here. If this function gets more complicated and
1715 * gcc suffers from register pressure on the x86, sk (in %ebx)
1716 * might be destroyed here. This current version compiles correctly,
1717 * but you have been warned.
1722 reason = SKB_DROP_REASON_TCP_CSUM;
1723 trace_tcp_bad_csum(skb);
1724 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1725 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1728 EXPORT_SYMBOL(tcp_v4_do_rcv);
1730 int tcp_v4_early_demux(struct sk_buff *skb)
1732 struct net *net = dev_net(skb->dev);
1733 const struct iphdr *iph;
1734 const struct tcphdr *th;
1737 if (skb->pkt_type != PACKET_HOST)
1740 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1746 if (th->doff < sizeof(struct tcphdr) / 4)
1749 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1750 iph->saddr, th->source,
1751 iph->daddr, ntohs(th->dest),
1752 skb->skb_iif, inet_sdif(skb));
1755 skb->destructor = sock_edemux;
1756 if (sk_fullsock(sk)) {
1757 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1760 dst = dst_check(dst, 0);
1762 sk->sk_rx_dst_ifindex == skb->skb_iif)
1763 skb_dst_set_noref(skb, dst);
1769 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1770 enum skb_drop_reason *reason)
1772 u32 limit, tail_gso_size, tail_gso_segs;
1773 struct skb_shared_info *shinfo;
1774 const struct tcphdr *th;
1775 struct tcphdr *thtail;
1776 struct sk_buff *tail;
1777 unsigned int hdrlen;
1783 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1784 * we can fix skb->truesize to its real value to avoid future drops.
1785 * This is valid because skb is not yet charged to the socket.
1786 * It has been noticed pure SACK packets were sometimes dropped
1787 * (if cooked by drivers without copybreak feature).
1793 if (unlikely(tcp_checksum_complete(skb))) {
1795 trace_tcp_bad_csum(skb);
1796 *reason = SKB_DROP_REASON_TCP_CSUM;
1797 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1798 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1802 /* Attempt coalescing to last skb in backlog, even if we are
1804 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1806 th = (const struct tcphdr *)skb->data;
1807 hdrlen = th->doff * 4;
1809 tail = sk->sk_backlog.tail;
1812 thtail = (struct tcphdr *)tail->data;
1814 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1815 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1816 ((TCP_SKB_CB(tail)->tcp_flags |
1817 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1818 !((TCP_SKB_CB(tail)->tcp_flags &
1819 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1820 ((TCP_SKB_CB(tail)->tcp_flags ^
1821 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1822 #ifdef CONFIG_TLS_DEVICE
1823 tail->decrypted != skb->decrypted ||
1825 thtail->doff != th->doff ||
1826 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1829 __skb_pull(skb, hdrlen);
1831 shinfo = skb_shinfo(skb);
1832 gso_size = shinfo->gso_size ?: skb->len;
1833 gso_segs = shinfo->gso_segs ?: 1;
1835 shinfo = skb_shinfo(tail);
1836 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1837 tail_gso_segs = shinfo->gso_segs ?: 1;
1839 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1840 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1842 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1843 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1844 thtail->window = th->window;
1847 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1848 * thtail->fin, so that the fast path in tcp_rcv_established()
1849 * is not entered if we append a packet with a FIN.
1850 * SYN, RST, URG are not present.
1851 * ACK is set on both packets.
1852 * PSH : we do not really care in TCP stack,
1853 * at least for 'GRO' packets.
1855 thtail->fin |= th->fin;
1856 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1858 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1859 TCP_SKB_CB(tail)->has_rxtstamp = true;
1860 tail->tstamp = skb->tstamp;
1861 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1864 /* Not as strict as GRO. We only need to carry mss max value */
1865 shinfo->gso_size = max(gso_size, tail_gso_size);
1866 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1868 sk->sk_backlog.len += delta;
1869 __NET_INC_STATS(sock_net(sk),
1870 LINUX_MIB_TCPBACKLOGCOALESCE);
1871 kfree_skb_partial(skb, fragstolen);
1874 __skb_push(skb, hdrlen);
1877 limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1879 /* Only the socket owner can try to collapse/prune rx queues
1880 * to reduce memory overhead, so add a little headroom here.
1881 * Only a few socket backlogs are likely to be concurrently non-empty.
1885 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1887 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1888 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1893 EXPORT_SYMBOL(tcp_add_backlog);
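/* Sketch of the backlog budget computed in tcp_add_backlog() above
 * (illustrative only): roughly the receive buffer plus half of the send
 * buffer, with a little extra headroom so the non-owning path does not
 * drop packets the owner could still absorb. The exact headroom constant
 * is not shown here.
 */
#include <stdint.h>

static uint32_t backlog_limit_sketch(uint32_t rcvbuf, uint32_t sndbuf,
				     uint32_t headroom)
{
	return rcvbuf + (sndbuf >> 1) + headroom;
}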
1895 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1897 struct tcphdr *th = (struct tcphdr *)skb->data;
1899 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1901 EXPORT_SYMBOL(tcp_filter);
1903 static void tcp_v4_restore_cb(struct sk_buff *skb)
1905 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1906 sizeof(struct inet_skb_parm));
1909 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1910 const struct tcphdr *th)
1912 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1913 * barrier() makes sure the compiler won't play fool^Waliasing games.
1915 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1916 sizeof(struct inet_skb_parm));
1919 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1920 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1921 skb->len - th->doff * 4);
1922 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1923 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1924 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1925 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1926 TCP_SKB_CB(skb)->sacked = 0;
1927 TCP_SKB_CB(skb)->has_rxtstamp =
1928 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1935 int tcp_v4_rcv(struct sk_buff *skb)
1937 struct net *net = dev_net(skb->dev);
1938 enum skb_drop_reason drop_reason;
1939 int sdif = inet_sdif(skb);
1940 int dif = inet_iif(skb);
1941 const struct iphdr *iph;
1942 const struct tcphdr *th;
1947 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1948 if (skb->pkt_type != PACKET_HOST)
1951 /* Count it even if it's bad */
1952 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1954 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1957 th = (const struct tcphdr *)skb->data;
1959 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1960 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1963 if (!pskb_may_pull(skb, th->doff * 4))
1966 /* An explanation is required here, I think.
1967 * Packet length and doff are validated by header prediction,
1968 * provided case of th->doff==0 is eliminated.
1969 * So, we defer the checks. */
1971 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1974 th = (const struct tcphdr *)skb->data;
1977 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
1978 skb, __tcp_hdrlen(th), th->source,
1979 th->dest, sdif, &refcounted);
1984 if (sk->sk_state == TCP_TIME_WAIT)
1987 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1988 struct request_sock *req = inet_reqsk(sk);
1989 bool req_stolen = false;
1992 sk = req->rsk_listener;
1993 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1994 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1996 drop_reason = tcp_inbound_md5_hash(sk, skb,
1997 &iph->saddr, &iph->daddr,
1998 AF_INET, dif, sdif);
1999 if (unlikely(drop_reason)) {
2000 sk_drops_add(sk, skb);
2004 if (tcp_checksum_complete(skb)) {
2008 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2009 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2011 inet_csk_reqsk_queue_drop_and_put(sk, req);
2015 /* reuseport_migrate_sock() has already held one sk_refcnt
2019 /* We own a reference on the listener, increase it again
2020 * as we might lose it too soon.
2026 if (!tcp_filter(sk, skb)) {
2027 th = (const struct tcphdr *)skb->data;
2029 tcp_v4_fill_cb(skb, iph, th);
2030 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2032 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2037 /* Another cpu got exclusive access to req
2038 * and created a full blown socket.
2039 * Try to feed this packet to this socket
2040 * instead of discarding it.
2042 tcp_v4_restore_cb(skb);
2046 goto discard_and_relse;
2051 tcp_v4_restore_cb(skb);
2052 } else if (tcp_child_process(sk, nsk, skb)) {
2053 tcp_v4_send_reset(nsk, skb);
2054 goto discard_and_relse;
2061 if (static_branch_unlikely(&ip4_min_ttl)) {
2062 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2063 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2064 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2065 goto discard_and_relse;
2069 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2070 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2071 goto discard_and_relse;
2074 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2075 &iph->daddr, AF_INET, dif, sdif);
2077 goto discard_and_relse;
2081 if (tcp_filter(sk, skb)) {
2082 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2083 goto discard_and_relse;
2085 th = (const struct tcphdr *)skb->data;
2087 tcp_v4_fill_cb(skb, iph, th);
2091 if (sk->sk_state == TCP_LISTEN) {
2092 ret = tcp_v4_do_rcv(sk, skb);
2093 goto put_and_return;
2096 sk_incoming_cpu_update(sk);
2098 bh_lock_sock_nested(sk);
2099 tcp_segs_in(tcp_sk(sk), skb);
2101 if (!sock_owned_by_user(sk)) {
2102 ret = tcp_v4_do_rcv(sk, skb);
2104 if (tcp_add_backlog(sk, skb, &drop_reason))
2105 goto discard_and_relse;
2116 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2117 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2120 tcp_v4_fill_cb(skb, iph, th);
2122 if (tcp_checksum_complete(skb)) {
2124 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2125 trace_tcp_bad_csum(skb);
2126 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2128 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2130 tcp_v4_send_reset(NULL, skb);
2134 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2135 /* Discard frame. */
2136 kfree_skb_reason(skb, drop_reason);
2140 sk_drops_add(sk, skb);
2146 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2147 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2148 inet_twsk_put(inet_twsk(sk));
2152 tcp_v4_fill_cb(skb, iph, th);
2154 if (tcp_checksum_complete(skb)) {
2155 inet_twsk_put(inet_twsk(sk));
2158 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2160 struct sock *sk2 = inet_lookup_listener(net,
2161 net->ipv4.tcp_death_row.hashinfo,
2162 skb, __tcp_hdrlen(th),
2163 iph->saddr, th->source,
2164 iph->daddr, th->dest,
2168 inet_twsk_deschedule_put(inet_twsk(sk));
2170 tcp_v4_restore_cb(skb);
2178 tcp_v4_timewait_ack(sk, skb);
2181 tcp_v4_send_reset(sk, skb);
2182 inet_twsk_deschedule_put(inet_twsk(sk));
2184 case TCP_TW_SUCCESS:;
2189 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2190 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2191 .twsk_unique = tcp_twsk_unique,
2192 .twsk_destructor= tcp_twsk_destructor,
2195 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2197 struct dst_entry *dst = skb_dst(skb);
2199 if (dst && dst_hold_safe(dst)) {
2200 rcu_assign_pointer(sk->sk_rx_dst, dst);
2201 sk->sk_rx_dst_ifindex = skb->skb_iif;
2204 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2206 const struct inet_connection_sock_af_ops ipv4_specific = {
2207 .queue_xmit = ip_queue_xmit,
2208 .send_check = tcp_v4_send_check,
2209 .rebuild_header = inet_sk_rebuild_header,
2210 .sk_rx_dst_set = inet_sk_rx_dst_set,
2211 .conn_request = tcp_v4_conn_request,
2212 .syn_recv_sock = tcp_v4_syn_recv_sock,
2213 .net_header_len = sizeof(struct iphdr),
2214 .setsockopt = ip_setsockopt,
2215 .getsockopt = ip_getsockopt,
2216 .addr2sockaddr = inet_csk_addr2sockaddr,
2217 .sockaddr_len = sizeof(struct sockaddr_in),
2218 .mtu_reduced = tcp_v4_mtu_reduced,
2220 EXPORT_SYMBOL(ipv4_specific);
2222 #ifdef CONFIG_TCP_MD5SIG
2223 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2224 .md5_lookup = tcp_v4_md5_lookup,
2225 .calc_md5_hash = tcp_v4_md5_hash_skb,
2226 .md5_parse = tcp_v4_parse_md5_keys,
2230 /* NOTE: A lot of things are set to zero explicitly by the call to
2231 * sk_alloc(), so they need not be done here.
2233 static int tcp_v4_init_sock(struct sock *sk)
2235 struct inet_connection_sock *icsk = inet_csk(sk);
2239 icsk->icsk_af_ops = &ipv4_specific;
2241 #ifdef CONFIG_TCP_MD5SIG
2242 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2248 void tcp_v4_destroy_sock(struct sock *sk)
2250 struct tcp_sock *tp = tcp_sk(sk);
2252 trace_tcp_destroy_sock(sk);
2254 tcp_clear_xmit_timers(sk);
2256 tcp_cleanup_congestion_control(sk);
2258 tcp_cleanup_ulp(sk);
2260 /* Clean up the write buffer. */
2261 tcp_write_queue_purge(sk);
2263 /* Check if we want to disable active TFO */
2264 tcp_fastopen_active_disable_ofo_check(sk);
2266 /* Cleans up our, hopefully empty, out_of_order_queue. */
2267 skb_rbtree_purge(&tp->out_of_order_queue);
2269 #ifdef CONFIG_TCP_MD5SIG
2270 /* Clean up the MD5 key list, if any */
2271 if (tp->md5sig_info) {
2272 tcp_clear_md5_list(sk);
2273 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2274 tp->md5sig_info = NULL;
2278 /* Clean up a referenced TCP bind bucket. */
2279 if (inet_csk(sk)->icsk_bind_hash)
2282 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2284 /* If socket is aborted during connect operation */
2285 tcp_free_fastopen_req(tp);
2286 tcp_fastopen_destroy_cipher(sk);
2287 tcp_saved_syn_free(tp);
2289 sk_sockets_allocated_dec(sk);
2291 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2293 #ifdef CONFIG_PROC_FS
2294 /* Proc filesystem TCP sock list dumping. */
2296 static unsigned short seq_file_family(const struct seq_file *seq);
2298 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2300 unsigned short family = seq_file_family(seq);
2302 /* AF_UNSPEC is used as a match all */
2303 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2304 net_eq(sock_net(sk), seq_file_net(seq)));
2307 /* Find a non empty bucket (starting from st->bucket)
2308 * and return the first sk from it.
2310 static void *listening_get_first(struct seq_file *seq)
2312 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2313 struct tcp_iter_state *st = seq->private;
2316 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2317 struct inet_listen_hashbucket *ilb2;
2318 struct hlist_nulls_node *node;
2321 ilb2 = &hinfo->lhash2[st->bucket];
2322 if (hlist_nulls_empty(&ilb2->nulls_head))
2325 spin_lock(&ilb2->lock);
2326 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2327 if (seq_sk_match(seq, sk))
2330 spin_unlock(&ilb2->lock);
2336 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2337  * If "cur" is the last one in the st->bucket,
2338  * call listening_get_first() to return the first sk of the next
2339  * non-empty bucket.
2340  */
2341 static void *listening_get_next(struct seq_file *seq, void *cur)
2342 {
2343 struct tcp_iter_state *st = seq->private;
2344 struct inet_listen_hashbucket *ilb2;
2345 struct hlist_nulls_node *node;
2346 struct inet_hashinfo *hinfo;
2347 struct sock *sk = cur;
2349 ++st->num;
2350 ++st->offset;
2352 sk = sk_nulls_next(sk);
2353 sk_nulls_for_each_from(sk, node) {
2354 if (seq_sk_match(seq, sk))
2355 return sk;
2356 }
2358 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2359 ilb2 = &hinfo->lhash2[st->bucket];
2360 spin_unlock(&ilb2->lock);
2362 return listening_get_first(seq);
2363 }
2365 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2367 struct tcp_iter_state *st = seq->private;
2372 rc = listening_get_first(seq);
2374 while (rc && *pos) {
2375 rc = listening_get_next(seq, rc);
2381 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2382 const struct tcp_iter_state *st)
2383 {
2384 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2385 }
2387 /*
2388  * Get first established socket starting from bucket given in st->bucket.
2389  * If st->bucket is zero, the very first socket in the hash is returned.
2390  */
2391 static void *established_get_first(struct seq_file *seq)
2392 {
2393 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2394 struct tcp_iter_state *st = seq->private;
2396 st->offset = 0;
2397 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2398 struct sock *sk;
2399 struct hlist_nulls_node *node;
2400 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2402 /* Lockless fast path for the common case of empty buckets */
2403 if (empty_bucket(hinfo, st))
2404 continue;
2406 spin_lock_bh(lock);
2407 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2408 if (seq_sk_match(seq, sk))
2409 return sk;
2410 }
2411 spin_unlock_bh(lock);
2412 }
2414 return NULL;
2415 }
2417 static void *established_get_next(struct seq_file *seq, void *cur)
2418 {
2419 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2420 struct tcp_iter_state *st = seq->private;
2421 struct hlist_nulls_node *node;
2422 struct sock *sk = cur;
2424 ++st->num;
2425 ++st->offset;
2427 sk = sk_nulls_next(sk);
2429 sk_nulls_for_each_from(sk, node) {
2430 if (seq_sk_match(seq, sk))
2431 return sk;
2432 }
2434 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2436 return established_get_first(seq);
2437 }
2439 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2441 struct tcp_iter_state *st = seq->private;
2445 rc = established_get_first(seq);
2448 rc = established_get_next(seq, rc);
2454 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2457 struct tcp_iter_state *st = seq->private;
2459 st->state = TCP_SEQ_STATE_LISTENING;
2460 rc = listening_get_idx(seq, &pos);
2463 st->state = TCP_SEQ_STATE_ESTABLISHED;
2464 rc = established_get_idx(seq, pos);
2470 static void *tcp_seek_last_pos(struct seq_file *seq)
2472 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2473 struct tcp_iter_state *st = seq->private;
2474 int bucket = st->bucket;
2475 int offset = st->offset;
2476 int orig_num = st->num;
2479 switch (st->state) {
2480 case TCP_SEQ_STATE_LISTENING:
2481 if (st->bucket > hinfo->lhash2_mask)
2483 st->state = TCP_SEQ_STATE_LISTENING;
2484 rc = listening_get_first(seq);
2485 while (offset-- && rc && bucket == st->bucket)
2486 rc = listening_get_next(seq, rc);
2490 st->state = TCP_SEQ_STATE_ESTABLISHED;
2492 case TCP_SEQ_STATE_ESTABLISHED:
2493 if (st->bucket > hinfo->ehash_mask)
2495 rc = established_get_first(seq);
2496 while (offset-- && rc && bucket == st->bucket)
2497 rc = established_get_next(seq, rc);
2505 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2507 struct tcp_iter_state *st = seq->private;
2510 if (*pos && *pos == st->last_pos) {
2511 rc = tcp_seek_last_pos(seq);
2516 st->state = TCP_SEQ_STATE_LISTENING;
2520 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2523 st->last_pos = *pos;
2526 EXPORT_SYMBOL(tcp_seq_start);
2528 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2530 struct tcp_iter_state *st = seq->private;
2533 if (v == SEQ_START_TOKEN) {
2534 rc = tcp_get_idx(seq, 0);
2538 switch (st->state) {
2539 case TCP_SEQ_STATE_LISTENING:
2540 rc = listening_get_next(seq, v);
2542 st->state = TCP_SEQ_STATE_ESTABLISHED;
2545 rc = established_get_first(seq);
2548 case TCP_SEQ_STATE_ESTABLISHED:
2549 rc = established_get_next(seq, v);
2554 st->last_pos = *pos;
2557 EXPORT_SYMBOL(tcp_seq_next);
2559 void tcp_seq_stop(struct seq_file *seq, void *v)
2560 {
2561 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2562 struct tcp_iter_state *st = seq->private;
2564 switch (st->state) {
2565 case TCP_SEQ_STATE_LISTENING:
2566 if (v != SEQ_START_TOKEN)
2567 spin_unlock(&hinfo->lhash2[st->bucket].lock);
2568 break;
2569 case TCP_SEQ_STATE_ESTABLISHED:
2570 if (v)
2571 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2572 break;
2573 }
2574 }
2575 EXPORT_SYMBOL(tcp_seq_stop);
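/*
 * Editor's note: a hedged userspace sketch of why the resume logic around
 * st->last_pos matters.  Reading /proc/net/tcp with a deliberately tiny
 * buffer forces many start()/stop() cycles, and each start() picks up from
 * the saved bucket/offset rather than rewalking both hash tables.
 */
#include <fcntl.h>
#include <unistd.h>

static void slow_read_proc_net_tcp(void)
{
	char buf[256];		/* tiny on purpose */
	ssize_t n;
	int fd = open("/proc/net/tcp", O_RDONLY);

	if (fd < 0)
		return;
	/* Small reads force repeated tcp_seq_start()/tcp_seq_stop() calls. */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		;
	close(fd);
}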
2577 static void get_openreq4(const struct request_sock *req,
2578 struct seq_file *f, int i)
2579 {
2580 const struct inet_request_sock *ireq = inet_rsk(req);
2581 long delta = req->rsk_timer.expires - jiffies;
2583 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2584 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2589 ntohs(ireq->ir_rmt_port),
2591 0, 0, /* could print option size, but that is af dependent. */
2592 1, /* timers active (only the expire timer) */
2593 jiffies_delta_to_clock_t(delta),
2595 from_kuid_munged(seq_user_ns(f),
2596 sock_i_uid(req->rsk_listener)),
2597 0, /* non standard timer */
2598 0, /* open_requests have no inode */
2599 0,
2600 req);
2601 }
2603 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2604 {
2605 int timer_active;
2606 unsigned long timer_expires;
2607 const struct tcp_sock *tp = tcp_sk(sk);
2608 const struct inet_connection_sock *icsk = inet_csk(sk);
2609 const struct inet_sock *inet = inet_sk(sk);
2610 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2611 __be32 dest = inet->inet_daddr;
2612 __be32 src = inet->inet_rcv_saddr;
2613 __u16 destp = ntohs(inet->inet_dport);
2614 __u16 srcp = ntohs(inet->inet_sport);
2615 int rx_queue;
2616 int state;
2618 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2619 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2620 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2621 timer_active = 1;
2622 timer_expires = icsk->icsk_timeout;
2623 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2624 timer_active = 4;
2625 timer_expires = icsk->icsk_timeout;
2626 } else if (timer_pending(&sk->sk_timer)) {
2627 timer_active = 2;
2628 timer_expires = sk->sk_timer.expires;
2629 } else {
2630 timer_active = 0;
2631 timer_expires = jiffies;
2632 }
2634 state = inet_sk_state_load(sk);
2635 if (state == TCP_LISTEN)
2636 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2637 else
2638 /* Because we don't lock the socket,
2639  * we might find a transient negative value.
2640  */
2641 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2642 READ_ONCE(tp->copied_seq), 0);
2644 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2645 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2646 i, src, srcp, dest, destp, state,
2647 READ_ONCE(tp->write_seq) - tp->snd_una,
2648 rx_queue,
2649 timer_active,
2650 jiffies_delta_to_clock_t(timer_expires - jiffies),
2651 icsk->icsk_retransmits,
2652 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2653 icsk->icsk_probes_out,
2654 sock_i_ino(sk),
2655 refcount_read(&sk->sk_refcnt), sk,
2656 jiffies_to_clock_t(icsk->icsk_rto),
2657 jiffies_to_clock_t(icsk->icsk_ack.ato),
2658 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2659 tcp_snd_cwnd(tp),
2660 state == TCP_LISTEN ?
2661 fastopenq->max_qlen :
2662 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2663 }
2665 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2666 struct seq_file *f, int i)
2667 {
2668 long delta = tw->tw_timer.expires - jiffies;
2669 __be32 dest, src;
2670 __u16 destp, srcp;
2672 dest = tw->tw_daddr;
2673 src = tw->tw_rcv_saddr;
2674 destp = ntohs(tw->tw_dport);
2675 srcp = ntohs(tw->tw_sport);
2677 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2678 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2679 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2680 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2681 refcount_read(&tw->tw_refcnt), tw);
2682 }
2686 static int tcp4_seq_show(struct seq_file *seq, void *v)
2687 {
2688 struct tcp_iter_state *st;
2689 struct sock *sk = v;
2691 seq_setwidth(seq, TMPSZ - 1);
2692 if (v == SEQ_START_TOKEN) {
2693 seq_puts(seq, " sl local_address rem_address st tx_queue "
2694 "rx_queue tr tm->when retrnsmt uid timeout "
2695 "inode");
2696 goto out;
2697 }
2698 st = seq->private;
2700 if (sk->sk_state == TCP_TIME_WAIT)
2701 get_timewait4_sock(v, seq, st->num);
2702 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2703 get_openreq4(v, seq, st->num);
2704 else
2705 get_tcp4_sock(v, seq, st->num);
2706 out:
2707 seq_pad(seq, '\n');
2708 return 0;
2709 }
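/*
 * Editor's note: a hedged userspace sketch of consuming the format printed
 * above.  Only the address, port and state columns of one /proc/net/tcp line
 * are parsed; the helper name and buffer handling are illustrative.
 */
#include <arpa/inet.h>
#include <stdio.h>

static void print_tcp4_entry(const char *line)
{
	unsigned int laddr, raddr, lport, rport, state;
	struct in_addr a;

	/* Addresses are raw 32-bit hex; ports and state are host-order hex. */
	if (sscanf(line, "%*d: %8X:%4X %8X:%4X %2X",
		   &laddr, &lport, &raddr, &rport, &state) != 5)
		return;

	/* On a host of the same endianness the parsed value maps straight
	 * back to the in-memory (network-order) address. */
	a.s_addr = laddr;
	printf("%s:%u state=0x%02X\n", inet_ntoa(a), lport, state);
}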
2711 #ifdef CONFIG_BPF_SYSCALL
2712 struct bpf_tcp_iter_state {
2713 struct tcp_iter_state state;
2714 unsigned int cur_sk;
2715 unsigned int end_sk;
2716 unsigned int max_sk;
2717 struct sock **batch;
2718 bool st_bucket_done;
2719 };
2721 struct bpf_iter__tcp {
2722 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2723 __bpf_md_ptr(struct sock_common *, sk_common);
2724 uid_t uid __aligned(8);
2725 };
2727 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2728 struct sock_common *sk_common, uid_t uid)
2729 {
2730 struct bpf_iter__tcp ctx;
2732 meta->seq_num--; /* skip SEQ_START_TOKEN */
2733 ctx.meta = meta;
2734 ctx.sk_common = sk_common;
2735 ctx.uid = uid;
2736 return bpf_iter_run_prog(prog, &ctx);
2737 }
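/*
 * Editor's note: a hedged sketch of a BPF program consuming the
 * bpf_iter__tcp context filled in above.  It assumes a bpftool-generated
 * vmlinux.h and libbpf's helper headers; the program name is illustrative.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("iter/tcp")
int dump_tcp4(struct bpf_iter__tcp *ctx)
{
	struct sock_common *skc = ctx->sk_common;
	struct seq_file *seq = ctx->meta->seq;

	if (!skc)
		return 0;

	/* uid was resolved by bpf_iter_tcp_seq_show() before the call. */
	BPF_SEQ_PRINTF(seq, "family=%u state=%u uid=%u\n",
		       skc->skc_family, skc->skc_state, ctx->uid);
	return 0;
}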
2739 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2740 {
2741 while (iter->cur_sk < iter->end_sk)
2742 sock_put(iter->batch[iter->cur_sk++]);
2743 }
2745 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2746 unsigned int new_batch_sz)
2747 {
2748 struct sock **new_batch;
2750 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2751 GFP_USER | __GFP_NOWARN);
2752 if (!new_batch)
2753 return -ENOMEM;
2755 bpf_iter_tcp_put_batch(iter);
2756 kvfree(iter->batch);
2757 iter->batch = new_batch;
2758 iter->max_sk = new_batch_sz;
2760 return 0;
2761 }
2763 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2764 struct sock *start_sk)
2766 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2767 struct bpf_tcp_iter_state *iter = seq->private;
2768 struct tcp_iter_state *st = &iter->state;
2769 struct hlist_nulls_node *node;
2770 unsigned int expected = 1;
2773 sock_hold(start_sk);
2774 iter->batch[iter->end_sk++] = start_sk;
2776 sk = sk_nulls_next(start_sk);
2777 sk_nulls_for_each_from(sk, node) {
2778 if (seq_sk_match(seq, sk)) {
2779 if (iter->end_sk < iter->max_sk) {
2780 sock_hold(sk);
2781 iter->batch[iter->end_sk++] = sk;
2782 }
2783 expected++;
2784 }
2785 }
2786 spin_unlock(&hinfo->lhash2[st->bucket].lock);
2788 return expected;
2789 }
2791 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2792 struct sock *start_sk)
2794 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2795 struct bpf_tcp_iter_state *iter = seq->private;
2796 struct tcp_iter_state *st = &iter->state;
2797 struct hlist_nulls_node *node;
2798 unsigned int expected = 1;
2801 sock_hold(start_sk);
2802 iter->batch[iter->end_sk++] = start_sk;
2804 sk = sk_nulls_next(start_sk);
2805 sk_nulls_for_each_from(sk, node) {
2806 if (seq_sk_match(seq, sk)) {
2807 if (iter->end_sk < iter->max_sk) {
2808 sock_hold(sk);
2809 iter->batch[iter->end_sk++] = sk;
2810 }
2811 expected++;
2812 }
2813 }
2814 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2816 return expected;
2817 }
2819 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2821 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2822 struct bpf_tcp_iter_state *iter = seq->private;
2823 struct tcp_iter_state *st = &iter->state;
2824 unsigned int expected;
2825 bool resized = false;
2828 /* The st->bucket is done. Directly advance to the next
2829 * bucket instead of having the tcp_seek_last_pos() to skip
2830 * one by one in the current bucket and eventually find out
2831 * it has to advance to the next bucket.
2833 if (iter->st_bucket_done) {
2836 if (st->state == TCP_SEQ_STATE_LISTENING &&
2837 st->bucket > hinfo->lhash2_mask) {
2838 st->state = TCP_SEQ_STATE_ESTABLISHED;
2844 /* Get a new batch */
2847 iter->st_bucket_done = false;
2849 sk = tcp_seek_last_pos(seq);
2851 return NULL; /* Done */
2853 if (st->state == TCP_SEQ_STATE_LISTENING)
2854 expected = bpf_iter_tcp_listening_batch(seq, sk);
2856 expected = bpf_iter_tcp_established_batch(seq, sk);
2858 if (iter->end_sk == expected) {
2859 iter->st_bucket_done = true;
2863 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2871 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2872 {
2873 /* bpf iter does not support lseek, so it always
2874  * continues from where it was stop()-ped.
2875  */
2876 if (*pos)
2877 return bpf_iter_tcp_batch(seq);
2879 return SEQ_START_TOKEN;
2880 }
2882 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2883 {
2884 struct bpf_tcp_iter_state *iter = seq->private;
2885 struct tcp_iter_state *st = &iter->state;
2886 struct sock *sk;
2888 /* Whenever seq_next() is called, the iter->cur_sk is
2889 * done with seq_show(), so advance to the next sk in
2892 if (iter->cur_sk < iter->end_sk) {
2893 /* Keeping st->num consistent in tcp_iter_state.
2894 * bpf_iter_tcp does not use st->num.
2895 * meta.seq_num is used instead.
2898 /* Move st->offset to the next sk in the bucket such that
2899 * the future start() will resume at st->offset in
2900 * st->bucket. See tcp_seek_last_pos().
2903 sock_put(iter->batch[iter->cur_sk++]);
2906 if (iter->cur_sk < iter->end_sk)
2907 sk = iter->batch[iter->cur_sk];
2908 else
2909 sk = bpf_iter_tcp_batch(seq);
2912 /* Keeping st->last_pos consistent in tcp_iter_state.
2913 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
2915 st->last_pos = *pos;
2919 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2920 {
2921 struct bpf_iter_meta meta;
2922 struct bpf_prog *prog;
2923 struct sock *sk = v;
2924 uid_t uid;
2925 bool slow;
2926 int ret;
2928 if (v == SEQ_START_TOKEN)
2931 if (sk_fullsock(sk))
2932 slow = lock_sock_fast(sk);
2934 if (unlikely(sk_unhashed(sk))) {
2939 if (sk->sk_state == TCP_TIME_WAIT) {
2941 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2942 const struct request_sock *req = v;
2944 uid = from_kuid_munged(seq_user_ns(seq),
2945 sock_i_uid(req->rsk_listener));
2946 } else {
2947 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2948 }
2951 prog = bpf_iter_get_info(&meta, false);
2952 ret = tcp_prog_seq_show(prog, &meta, v, uid);
2955 if (sk_fullsock(sk))
2956 unlock_sock_fast(sk, slow);
2961 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2963 struct bpf_tcp_iter_state *iter = seq->private;
2964 struct bpf_iter_meta meta;
2965 struct bpf_prog *prog;
2969 prog = bpf_iter_get_info(&meta, true);
2971 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2974 if (iter->cur_sk < iter->end_sk) {
2975 bpf_iter_tcp_put_batch(iter);
2976 iter->st_bucket_done = false;
2977 }
2978 }
2980 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2981 .show = bpf_iter_tcp_seq_show,
2982 .start = bpf_iter_tcp_seq_start,
2983 .next = bpf_iter_tcp_seq_next,
2984 .stop = bpf_iter_tcp_seq_stop,
2985 };
2987 static unsigned short seq_file_family(const struct seq_file *seq)
2988 {
2989 const struct tcp_seq_afinfo *afinfo;
2991 #ifdef CONFIG_BPF_SYSCALL
2992 /* Iterated from bpf_iter. Let the bpf prog filter instead. */
2993 if (seq->op == &bpf_iter_tcp_seq_ops)
2994 return AF_UNSPEC;
2995 #endif
2997 /* Iterated from proc fs. */
2998 afinfo = pde_data(file_inode(seq->file));
2999 return afinfo->family;
3000 }
3002 static const struct seq_operations tcp4_seq_ops = {
3003 .show = tcp4_seq_show,
3004 .start = tcp_seq_start,
3005 .next = tcp_seq_next,
3006 .stop = tcp_seq_stop,
3007 };
3009 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3010 .family = AF_INET,
3011 };
3013 static int __net_init tcp4_proc_init_net(struct net *net)
3014 {
3015 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3016 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3017 return -ENOMEM;
3018 return 0;
3019 }
3021 static void __net_exit tcp4_proc_exit_net(struct net *net)
3022 {
3023 remove_proc_entry("tcp", net->proc_net);
3024 }
3026 static struct pernet_operations tcp4_net_ops = {
3027 .init = tcp4_proc_init_net,
3028 .exit = tcp4_proc_exit_net,
3029 };
3031 int __init tcp4_proc_init(void)
3032 {
3033 return register_pernet_subsys(&tcp4_net_ops);
3034 }
3036 void tcp4_proc_exit(void)
3037 {
3038 unregister_pernet_subsys(&tcp4_net_ops);
3039 }
3040 #endif /* CONFIG_PROC_FS */
3042 /* @wake is one when sk_stream_write_space() calls us.
3043  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3044  * This mimics the strategy used in sock_def_write_space().
3045  */
3046 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3047 {
3048 const struct tcp_sock *tp = tcp_sk(sk);
3049 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3050 READ_ONCE(tp->snd_nxt);
3052 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3053 }
3054 EXPORT_SYMBOL(tcp_stream_memory_free);
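/*
 * Editor's note: a hedged userspace sketch of the per-socket knob this
 * predicate evaluates.  TCP_NOTSENT_LOWAT roughly bounds how much unsent
 * data may queue before poll() stops reporting writability; 128 KB is an
 * illustrative value.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int cap_unsent_bytes(int fd)
{
	int lowat = 128 * 1024;

	/* tcp_stream_memory_free() compares notsent bytes against this. */
	return setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
			  &lowat, sizeof(lowat));
}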
3056 struct proto tcp_prot = {
3057 .name = "TCP",
3058 .owner = THIS_MODULE,
3060 .pre_connect = tcp_v4_pre_connect,
3061 .connect = tcp_v4_connect,
3062 .disconnect = tcp_disconnect,
3063 .accept = inet_csk_accept,
3064 .ioctl = tcp_ioctl,
3065 .init = tcp_v4_init_sock,
3066 .destroy = tcp_v4_destroy_sock,
3067 .shutdown = tcp_shutdown,
3068 .setsockopt = tcp_setsockopt,
3069 .getsockopt = tcp_getsockopt,
3070 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3071 .keepalive = tcp_set_keepalive,
3072 .recvmsg = tcp_recvmsg,
3073 .sendmsg = tcp_sendmsg,
3074 .sendpage = tcp_sendpage,
3075 .backlog_rcv = tcp_v4_do_rcv,
3076 .release_cb = tcp_release_cb,
3077 .hash = inet_hash,
3078 .unhash = inet_unhash,
3079 .get_port = inet_csk_get_port,
3080 .put_port = inet_put_port,
3081 #ifdef CONFIG_BPF_SYSCALL
3082 .psock_update_sk_prot = tcp_bpf_update_proto,
3083 #endif
3084 .enter_memory_pressure = tcp_enter_memory_pressure,
3085 .leave_memory_pressure = tcp_leave_memory_pressure,
3086 .stream_memory_free = tcp_stream_memory_free,
3087 .sockets_allocated = &tcp_sockets_allocated,
3088 .orphan_count = &tcp_orphan_count,
3090 .memory_allocated = &tcp_memory_allocated,
3091 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
3093 .memory_pressure = &tcp_memory_pressure,
3094 .sysctl_mem = sysctl_tcp_mem,
3095 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3096 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3097 .max_header = MAX_TCP_HEADER,
3098 .obj_size = sizeof(struct tcp_sock),
3099 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3100 .twsk_prot = &tcp_timewait_sock_ops,
3101 .rsk_prot = &tcp_request_sock_ops,
3103 .no_autobind = true,
3104 .diag_destroy = tcp_abort,
3105 };
3106 EXPORT_SYMBOL(tcp_prot);
3108 static void __net_exit tcp_sk_exit(struct net *net)
3109 {
3110 if (net->ipv4.tcp_congestion_control)
3111 bpf_module_put(net->ipv4.tcp_congestion_control,
3112 net->ipv4.tcp_congestion_control->owner);
3113 }
3115 static void __net_init tcp_set_hashinfo(struct net *net)
3116 {
3117 struct inet_hashinfo *hinfo;
3118 unsigned int ehash_entries;
3119 struct net *old_net;
3121 if (net_eq(net, &init_net))
3122 goto fallback;
3124 old_net = current->nsproxy->net_ns;
3125 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3126 if (!ehash_entries)
3127 goto fallback;
3129 ehash_entries = roundup_pow_of_two(ehash_entries);
3130 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3131 if (!hinfo) {
3132 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3133 "for a netns, fallback to the global one\n",
3134 ehash_entries);
3135 fallback:
3136 hinfo = &tcp_hashinfo;
3137 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3138 }
3140 net->ipv4.tcp_death_row.hashinfo = hinfo;
3141 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3142 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3143 }
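/*
 * Editor's note: a hedged sketch of driving the per-netns ehash sizing
 * above from userspace.  The parent namespace sets tcp_child_ehash_entries
 * and then creates the child; the value 8192 and the lack of error handling
 * are illustrative only.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <unistd.h>

static int spawn_netns_with_small_ehash(void)
{
	int fd = open("/proc/sys/net/ipv4/tcp_child_ehash_entries", O_WRONLY);

	if (fd >= 0) {
		write(fd, "8192", 4);	/* rounded up to a power of two above */
		close(fd);
	}
	/* The child netns picks the value up in tcp_set_hashinfo(). */
	return unshare(CLONE_NEWNET);
}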
3145 static int __net_init tcp_sk_init(struct net *net)
3146 {
3147 net->ipv4.sysctl_tcp_ecn = 2;
3148 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3150 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3151 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3152 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3153 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3154 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3156 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3157 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3158 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3160 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3161 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3162 net->ipv4.sysctl_tcp_syncookies = 1;
3163 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3164 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3165 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3166 net->ipv4.sysctl_tcp_orphan_retries = 0;
3167 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3168 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3169 net->ipv4.sysctl_tcp_tw_reuse = 2;
3170 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3172 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3173 tcp_set_hashinfo(net);
3175 net->ipv4.sysctl_tcp_sack = 1;
3176 net->ipv4.sysctl_tcp_window_scaling = 1;
3177 net->ipv4.sysctl_tcp_timestamps = 1;
3178 net->ipv4.sysctl_tcp_early_retrans = 3;
3179 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3180 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3181 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3182 net->ipv4.sysctl_tcp_max_reordering = 300;
3183 net->ipv4.sysctl_tcp_dsack = 1;
3184 net->ipv4.sysctl_tcp_app_win = 31;
3185 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3186 net->ipv4.sysctl_tcp_frto = 2;
3187 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3188 /* This limits the percentage of the congestion window which we
3189 * will allow a single TSO frame to consume. Building TSO frames
3190 * which are too large can cause TCP streams to be bursty.
3192 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3193 /* Default TSQ limit of 16 TSO segments */
3194 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3196 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3197 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3199 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3200 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
3201 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3202 net->ipv4.sysctl_tcp_autocorking = 1;
3203 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3204 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3205 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3206 if (net != &init_net) {
3207 memcpy(net->ipv4.sysctl_tcp_rmem,
3208 init_net.ipv4.sysctl_tcp_rmem,
3209 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3210 memcpy(net->ipv4.sysctl_tcp_wmem,
3211 init_net.ipv4.sysctl_tcp_wmem,
3212 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3213 }
3214 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3215 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3216 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3217 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3218 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3219 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3221 /* Reno is always built in */
3222 if (!net_eq(net, &init_net) &&
3223 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3224 init_net.ipv4.tcp_congestion_control->owner))
3225 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3226 else
3227 net->ipv4.tcp_congestion_control = &tcp_reno;
3229 return 0;
3230 }
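/*
 * Editor's note: the defaults above enable Fast Open for clients only
 * (TFO_CLIENT_ENABLE).  A hedged sketch of the server side: the listener
 * opts in per socket, and the tcp_fastopen sysctl must also have the server
 * bit (0x2) set.  The queue length of 16 is illustrative.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int enable_server_fastopen(int listen_fd)
{
	int qlen = 16;	/* max pending Fast Open requests */

	return setsockopt(listen_fd, IPPROTO_TCP, TCP_FASTOPEN,
			  &qlen, sizeof(qlen));
}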
3232 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3233 {
3234 struct net *net;
3236 tcp_twsk_purge(net_exit_list, AF_INET);
3238 list_for_each_entry(net, net_exit_list, exit_list) {
3239 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3240 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3241 tcp_fastopen_ctx_destroy(net);
3242 }
3243 }
3245 static struct pernet_operations __net_initdata tcp_sk_ops = {
3246 .init = tcp_sk_init,
3247 .exit = tcp_sk_exit,
3248 .exit_batch = tcp_sk_exit_batch,
3249 };
3251 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3252 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3253 struct sock_common *sk_common, uid_t uid)
3255 #define INIT_BATCH_SZ 16
3257 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3258 {
3259 struct bpf_tcp_iter_state *iter = priv_data;
3260 int err;
3262 err = bpf_iter_init_seq_net(priv_data, aux);
3263 if (err)
3264 return err;
3266 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3267 if (err) {
3268 bpf_iter_fini_seq_net(priv_data);
3269 return err;
3270 }
3272 return 0;
3273 }
3275 static void bpf_iter_fini_tcp(void *priv_data)
3276 {
3277 struct bpf_tcp_iter_state *iter = priv_data;
3279 bpf_iter_fini_seq_net(priv_data);
3280 kvfree(iter->batch);
3281 }
3283 static const struct bpf_iter_seq_info tcp_seq_info = {
3284 .seq_ops = &bpf_iter_tcp_seq_ops,
3285 .init_seq_private = bpf_iter_init_tcp,
3286 .fini_seq_private = bpf_iter_fini_tcp,
3287 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3288 };
3290 static const struct bpf_func_proto *
3291 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3292 const struct bpf_prog *prog)
3293 {
3294 switch (func_id) {
3295 case BPF_FUNC_setsockopt:
3296 return &bpf_sk_setsockopt_proto;
3297 case BPF_FUNC_getsockopt:
3298 return &bpf_sk_getsockopt_proto;
3299 default:
3300 return NULL;
3301 }
3302 }
3304 static struct bpf_iter_reg tcp_reg_info = {
3305 .target = "tcp",
3306 .ctx_arg_info_size = 1,
3307 .ctx_arg_info = {
3308 { offsetof(struct bpf_iter__tcp, sk_common),
3309 PTR_TO_BTF_ID_OR_NULL },
3310 },
3311 .get_func_proto = bpf_iter_tcp_get_func_proto,
3312 .seq_info = &tcp_seq_info,
3313 };
3315 static void __init bpf_iter_register(void)
3316 {
3317 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3318 if (bpf_iter_reg_target(&tcp_reg_info))
3319 pr_warn("Warning: could not register bpf iterator tcp\n");
3320 }
3322 #endif
3324 void __init tcp_v4_init(void)
3325 {
3326 int cpu, res;
3328 for_each_possible_cpu(cpu) {
3329 struct sock *sk;
3331 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3332 IPPROTO_TCP, &init_net);
3333 if (res)
3334 panic("Failed to create the TCP control socket.\n");
3335 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3337 /* Please enforce IP_DF and IPID==0 for RST and
3338  * ACK sent in SYN-RECV and TIME-WAIT state.
3339  */
3340 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3342 per_cpu(ipv4_tcp_sk, cpu) = sk;
3343 }
3344 if (register_pernet_subsys(&tcp_sk_ops))
3345 panic("Failed to create the TCP control socket.\n");
3347 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3348 bpf_iter_register();
3349 #endif
3350 }