1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * Implementation of the Transmission Control Protocol(TCP).
9 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
48 #define pr_fmt(fmt) "TCP: " fmt
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
61 #include <net/net_namespace.h>
63 #include <net/inet_hashtables.h>
65 #include <net/transp_v6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
84 #include <trace/events/tcp.h>
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
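/* Per-CPU kernel control socket, used by tcp_v4_send_reset() and
 * tcp_v4_send_ack() to emit replies (RSTs, TIME-WAIT/SYN-RECV ACKs) that
 * are not tied to a full socket.
 */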
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
98 return secure_tcp_seq(ip_hdr(skb)->daddr,
101 tcp_hdr(skb)->source);
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
106 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112 const struct inet_timewait_sock *tw = inet_twsk(sktw);
113 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 struct tcp_sock *tp = tcp_sk(sk);
117 /* Still does not detect *everything* that goes through
118 * lo, since we require a loopback src or dst address
119 * or direct binding to 'lo' interface.
121 bool loopback = false;
122 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
124 #if IS_ENABLED(CONFIG_IPV6)
125 if (tw->tw_family == AF_INET6) {
126 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
134 if (ipv4_is_loopback(tw->tw_daddr) ||
135 ipv4_is_loopback(tw->tw_rcv_saddr))
142 /* With PAWS, it is safe from the viewpoint
143 of data integrity. Even without PAWS it is safe provided sequence
144 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
   Actually, the idea is close to VJ's one, only the timestamp cache is
   held not per host but per port pair, and the TW bucket is used as the
   state holder.

   If the TW bucket has already been destroyed we fall back to VJ's scheme
   and use the initial timestamp retrieved from the peer table.
 */
153 if (tcptw->tw_ts_recent_stamp &&
154 (!twp || (reuse && time_after32(ktime_get_seconds(),
155 tcptw->tw_ts_recent_stamp)))) {
156 /* In case of repair and re-using TIME-WAIT sockets we still
157 * want to be sure that it is safe as above but honor the
 * sequence numbers and time stamps set as part of the repair process.
161 * Without this check re-using a TIME-WAIT socket with TCP
162 * repair would accumulate a -1 on the repair assigned
163 * sequence number. The first time it is reused the sequence
164 * is -1, the second time -2, etc. This fixes that issue
165 * without appearing to create any others.
167 if (likely(!tp->repair)) {
168 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
172 WRITE_ONCE(tp->write_seq, seq);
173 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
174 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
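/* Called via the pre_connect proto op before tcp_v4_connect(): gives
 * BPF_CGROUP_INET4_CONNECT programs a chance to observe or rewrite the
 * destination address while the socket lock is held.
 */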
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent the BPF program called below from accessing bytes that are
	 * outside the bound specified by the user in addr_len.
	 */
191 if (addr_len < sizeof(struct sockaddr_in))
194 sock_owned_by_me(sk);
196 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
202 struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
203 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
204 struct inet_timewait_death_row *tcp_death_row;
205 __be32 daddr, nexthop, prev_sk_rcv_saddr;
206 struct inet_sock *inet = inet_sk(sk);
207 struct tcp_sock *tp = tcp_sk(sk);
208 struct ip_options_rcu *inet_opt;
209 struct net *net = sock_net(sk);
210 __be16 orig_sport, orig_dport;
215 if (addr_len < sizeof(struct sockaddr_in))
218 if (usin->sin_family != AF_INET)
219 return -EAFNOSUPPORT;
221 nexthop = daddr = usin->sin_addr.s_addr;
222 inet_opt = rcu_dereference_protected(inet->inet_opt,
223 lockdep_sock_is_held(sk));
224 if (inet_opt && inet_opt->opt.srr) {
227 nexthop = inet_opt->opt.faddr;
230 orig_sport = inet->inet_sport;
231 orig_dport = usin->sin_port;
232 fl4 = &inet->cork.fl.u.ip4;
233 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
234 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
238 if (err == -ENETUNREACH)
239 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
243 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
248 if (!inet_opt || !inet_opt->opt.srr)
251 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
253 if (!inet->inet_saddr) {
254 if (inet_csk(sk)->icsk_bind2_hash) {
255 prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo,
256 sk, net, inet->inet_num);
257 prev_sk_rcv_saddr = sk->sk_rcv_saddr;
259 inet->inet_saddr = fl4->saddr;
262 sk_rcv_saddr_set(sk, inet->inet_saddr);
264 if (prev_addr_hashbucket) {
265 err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
267 inet->inet_saddr = 0;
268 sk_rcv_saddr_set(sk, prev_sk_rcv_saddr);
274 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
275 /* Reset inherited state */
276 tp->rx_opt.ts_recent = 0;
277 tp->rx_opt.ts_recent_stamp = 0;
278 if (likely(!tp->repair))
279 WRITE_ONCE(tp->write_seq, 0);
282 inet->inet_dport = usin->sin_port;
283 sk_daddr_set(sk, daddr);
285 inet_csk(sk)->icsk_ext_hdr_len = 0;
287 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
289 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
296 tcp_set_state(sk, TCP_SYN_SENT);
297 err = inet_hash_connect(tcp_death_row, sk);
303 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
304 inet->inet_sport, inet->inet_dport, sk);
310 /* OK, now commit destination to socket. */
311 sk->sk_gso_type = SKB_GSO_TCPV4;
312 sk_setup_caps(sk, &rt->dst);
315 if (likely(!tp->repair)) {
317 WRITE_ONCE(tp->write_seq,
318 secure_tcp_seq(inet->inet_saddr,
322 tp->tsoffset = secure_tcp_ts_off(net, inet->inet_saddr,
326 inet->inet_id = get_random_u16();
328 if (tcp_fastopen_defer_connect(sk, &err))
333 err = tcp_connect(sk);
	 * This unhashes the socket and releases the local port, if necessary.
345 tcp_set_state(sk, TCP_CLOSE);
346 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
347 inet_reset_saddr(sk);
349 sk->sk_route_caps = 0;
350 inet->inet_dport = 0;
353 EXPORT_SYMBOL(tcp_v4_connect);
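/* Illustrative userspace sketch (not part of this file): a plain connect()
 * on an AF_INET stream socket is what ends up in tcp_v4_connect() above.
 * Error checking is omitted and the address is a documentation placeholder.
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *	close(fd);
 */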
/*
 * This routine reacts to ICMP_FRAG_NEEDED MTU indications as defined in RFC 1191.
 * It can be called through tcp_release_cb() if the socket was owned by the user
 * at the time tcp_v4_err() was called to handle the ICMP message.
 */
360 void tcp_v4_mtu_reduced(struct sock *sk)
362 struct inet_sock *inet = inet_sk(sk);
363 struct dst_entry *dst;
366 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
368 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
369 dst = inet_csk_update_pmtu(sk, mtu);
	/* Something is about to go wrong. Remember the soft error
	 * in case this connection is not able to recover.
	 */
376 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
377 sk->sk_err_soft = EMSGSIZE;
381 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
382 ip_sk_accept_pmtu(sk) &&
383 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
384 tcp_sync_mss(sk, mtu);
		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path MTU discovery.
		 */
391 tcp_simple_retransmit(sk);
392 } /* else let the usual retransmit timer handle it */
394 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
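/* Handle an ICMP redirect: if the socket still has a valid cached route,
 * let that dst's ops update the next hop. __sk_dst_check() filters out
 * stale entries.
 */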
396 static void do_redirect(struct sk_buff *skb, struct sock *sk)
398 struct dst_entry *dst = __sk_dst_check(sk, 0);
401 dst->ops->redirect(dst, sk, skb);
405 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
406 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
408 struct request_sock *req = inet_reqsk(sk);
409 struct net *net = sock_net(sk);
411 /* ICMPs are not backlogged, hence we cannot get
412 * an established socket here.
414 if (seq != tcp_rsk(req)->snt_isn) {
415 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
418 * Still in SYN_RECV, just remove it silently.
419 * There is no good way to pass the error to the newly
420 * created socket, and POSIX does not want network
421 * errors returned from accept().
423 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
424 tcp_listendrop(req->rsk_listener);
428 EXPORT_SYMBOL(tcp_req_err);
430 /* TCP-LD (RFC 6069) logic */
431 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
433 struct inet_connection_sock *icsk = inet_csk(sk);
434 struct tcp_sock *tp = tcp_sk(sk);
439 if (sock_owned_by_user(sk))
442 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
446 skb = tcp_rtx_queue_head(sk);
447 if (WARN_ON_ONCE(!skb))
450 icsk->icsk_backoff--;
451 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
452 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
454 tcp_mstamp_refresh(tp);
455 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
456 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
459 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
460 remaining, TCP_RTO_MAX);
462 /* RTO revert clocked out retransmission.
463 * Will retransmit now.
465 tcp_retransmit_timer(sk);
468 EXPORT_SYMBOL(tcp_ld_RTO_revert);
471 * This routine is called by the ICMP module when it gets some
472 * sort of error condition. If err < 0 then the socket should
473 * be closed and the error returned to the user. If err > 0
474 * it's just the icmp type << 8 | icmp code. After adjustment
475 * header points to the first 8 bytes of the tcp header. We need
476 * to find the appropriate port.
478 * The locking strategy used here is very "optimistic". When
479 * someone else accesses the socket the ICMP is just dropped
480 * and for some paths there is no check at all.
481 * A more general error queue to queue errors for later handling
482 * is probably better.
486 int tcp_v4_err(struct sk_buff *skb, u32 info)
488 const struct iphdr *iph = (const struct iphdr *)skb->data;
489 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
491 struct inet_sock *inet;
492 const int type = icmp_hdr(skb)->type;
493 const int code = icmp_hdr(skb)->code;
495 struct request_sock *fastopen;
498 struct net *net = dev_net(skb->dev);
500 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
501 iph->daddr, th->dest, iph->saddr,
502 ntohs(th->source), inet_iif(skb), 0);
504 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
507 if (sk->sk_state == TCP_TIME_WAIT) {
508 inet_twsk_put(inet_twsk(sk));
511 seq = ntohl(th->seq);
512 if (sk->sk_state == TCP_NEW_SYN_RECV) {
513 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
514 type == ICMP_TIME_EXCEEDED ||
515 (type == ICMP_DEST_UNREACH &&
516 (code == ICMP_NET_UNREACH ||
517 code == ICMP_HOST_UNREACH)));
522 /* If too many ICMPs get dropped on busy
523 * servers this needs to be solved differently.
	 * We do take care of the PMTU discovery (RFC 1191) special case:
	 * we can receive locally generated ICMP messages while the socket is held.
	 */
527 if (sock_owned_by_user(sk)) {
528 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
529 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
531 if (sk->sk_state == TCP_CLOSE)
534 if (static_branch_unlikely(&ip4_min_ttl)) {
535 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
536 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
537 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
544 fastopen = rcu_dereference(tp->fastopen_rsk);
545 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
546 if (sk->sk_state != TCP_LISTEN &&
547 !between(seq, snd_una, tp->snd_nxt)) {
548 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
554 if (!sock_owned_by_user(sk))
555 do_redirect(skb, sk);
557 case ICMP_SOURCE_QUENCH:
558 /* Just silently ignore these. */
560 case ICMP_PARAMETERPROB:
563 case ICMP_DEST_UNREACH:
564 if (code > NR_ICMP_UNREACH)
567 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
			 * they should go through unfragmented).
			 */
572 if (sk->sk_state == TCP_LISTEN)
575 WRITE_ONCE(tp->mtu_info, info);
576 if (!sock_owned_by_user(sk)) {
577 tcp_v4_mtu_reduced(sk);
579 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
585 err = icmp_err_convert[code].errno;
586 /* check if this ICMP message allows revert of backoff.
590 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
591 tcp_ld_RTO_revert(sk, seq);
593 case ICMP_TIME_EXCEEDED:
600 switch (sk->sk_state) {
603 /* Only in fast or simultaneous open. If a fast open socket is
604 * already accepted it is treated as a connected one below.
606 if (fastopen && !fastopen->sk)
609 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
611 if (!sock_owned_by_user(sk)) {
618 sk->sk_err_soft = err;
	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
	 * to be considered hard errors (well, FRAG_FAILED too,
	 * but it is obsoleted by PMTU discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors ordered by their masters, even these two messages have
	 * lost their original sense (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 */
640 if (!sock_owned_by_user(sk) && inet->recverr) {
643 } else { /* Only an error on timeout */
644 sk->sk_err_soft = err;
653 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
655 struct tcphdr *th = tcp_hdr(skb);
657 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
658 skb->csum_start = skb_transport_header(skb) - skb->head;
659 skb->csum_offset = offsetof(struct tcphdr, check);
662 /* This routine computes an IPv4 TCP checksum. */
663 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
665 const struct inet_sock *inet = inet_sk(sk);
667 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
669 EXPORT_SYMBOL(tcp_v4_send_check);
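/* Note: __tcp_v4_send_check() only stores the pseudo-header checksum and
 * records csum_start/csum_offset; the device (or skb_checksum_help() as a
 * software fallback) fills in the final checksum for CHECKSUM_PARTIAL skbs.
 */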
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)?
 *	Answer: if a packet caused an RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other side's TCP.
 *		So we build the reply based only on the parameters
 *		that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
684 #ifdef CONFIG_TCP_MD5SIG
685 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
687 #define OPTION_BYTES sizeof(__be32)
690 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
692 const struct tcphdr *th = tcp_hdr(skb);
695 __be32 opt[OPTION_BYTES / sizeof(__be32)];
697 struct ip_reply_arg arg;
698 #ifdef CONFIG_TCP_MD5SIG
699 struct tcp_md5sig_key *key = NULL;
700 const __u8 *hash_location = NULL;
701 unsigned char newhash[16];
703 struct sock *sk1 = NULL;
705 u64 transmit_time = 0;
709 /* Never send a reset in response to a reset. */
	/* If sk is not NULL, it means we did a successful lookup and the incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
716 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
719 /* Swap the send and the receive. */
720 memset(&rep, 0, sizeof(rep));
721 rep.th.dest = th->source;
722 rep.th.source = th->dest;
723 rep.th.doff = sizeof(struct tcphdr) / 4;
727 rep.th.seq = th->ack_seq;
730 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
731 skb->len - (th->doff << 2));
734 memset(&arg, 0, sizeof(arg));
735 arg.iov[0].iov_base = (unsigned char *)&rep;
736 arg.iov[0].iov_len = sizeof(rep.th);
738 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
739 #ifdef CONFIG_TCP_MD5SIG
741 hash_location = tcp_parse_md5sig_option(th);
742 if (sk && sk_fullsock(sk)) {
743 const union tcp_md5_addr *addr;
746 /* sdif set, means packet ingressed via a device
747 * in an L3 domain and inet_iif is set to it.
749 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
750 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
751 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
752 } else if (hash_location) {
753 const union tcp_md5_addr *addr;
754 int sdif = tcp_v4_sdif(skb);
755 int dif = inet_iif(skb);
		/*
		 * The active side is lost. Try to find the listening socket through
		 * the source port, and then find the md5 key through the listening
		 * socket. We are not losing security here:
		 * the incoming packet is checked with the md5 hash of the found key;
		 * no RST is generated if the md5 hash doesn't match.
		 */
765 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
766 NULL, 0, ip_hdr(skb)->saddr,
767 th->source, ip_hdr(skb)->daddr,
768 ntohs(th->source), dif, sdif);
		/* don't send an RST if we can't find a key */
773 /* sdif set, means packet ingressed via a device
774 * in an L3 domain and dif is set to it.
776 l3index = sdif ? dif : 0;
777 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
778 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
783 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
784 if (genhash || memcmp(hash_location, newhash, 16) != 0)
790 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
792 (TCPOPT_MD5SIG << 8) |
794 /* Update length and the length the header thinks exists */
795 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
796 rep.th.doff = arg.iov[0].iov_len / 4;
798 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
799 key, ip_hdr(skb)->saddr,
800 ip_hdr(skb)->daddr, &rep.th);
803 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
804 if (rep.opt[0] == 0) {
805 __be32 mrst = mptcp_reset_option(skb);
809 arg.iov[0].iov_len += sizeof(mrst);
810 rep.th.doff = arg.iov[0].iov_len / 4;
814 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
815 ip_hdr(skb)->saddr, /* XXX */
816 arg.iov[0].iov_len, IPPROTO_TCP, 0);
817 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
818 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When the socket is gone, all binding information is lost.
	 * Routing might fail in this case. No choice here: if we choose to force
	 * the input interface, we will misroute in case of an asymmetric route.
	 */
825 arg.bound_dev_if = sk->sk_bound_dev_if;
827 trace_tcp_send_reset(sk, skb);
830 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
831 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
833 arg.tos = ip_hdr(skb)->tos;
834 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
836 ctl_sk = this_cpu_read(ipv4_tcp_sk);
837 sock_net_set(ctl_sk, net);
839 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
840 inet_twsk(sk)->tw_mark : sk->sk_mark;
841 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
842 inet_twsk(sk)->tw_priority : sk->sk_priority;
843 transmit_time = tcp_transmit_time(sk);
844 xfrm_sk_clone_policy(ctl_sk, sk);
846 ip_send_unicast_reply(ctl_sk,
847 skb, &TCP_SKB_CB(skb)->header.h4.opt,
848 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
849 &arg, arg.iov[0].iov_len,
853 xfrm_sk_free_policy(ctl_sk);
854 sock_net_set(ctl_sk, &init_net);
855 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
856 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
859 #ifdef CONFIG_TCP_MD5SIG
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */
869 static void tcp_v4_send_ack(const struct sock *sk,
870 struct sk_buff *skb, u32 seq, u32 ack,
871 u32 win, u32 tsval, u32 tsecr, int oif,
872 struct tcp_md5sig_key *key,
873 int reply_flags, u8 tos)
875 const struct tcphdr *th = tcp_hdr(skb);
878 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
879 #ifdef CONFIG_TCP_MD5SIG
880 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
884 struct net *net = sock_net(sk);
885 struct ip_reply_arg arg;
889 memset(&rep.th, 0, sizeof(struct tcphdr));
890 memset(&arg, 0, sizeof(arg));
892 arg.iov[0].iov_base = (unsigned char *)&rep;
893 arg.iov[0].iov_len = sizeof(rep.th);
895 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
896 (TCPOPT_TIMESTAMP << 8) |
898 rep.opt[1] = htonl(tsval);
899 rep.opt[2] = htonl(tsecr);
900 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
903 /* Swap the send and the receive. */
904 rep.th.dest = th->source;
905 rep.th.source = th->dest;
906 rep.th.doff = arg.iov[0].iov_len / 4;
907 rep.th.seq = htonl(seq);
908 rep.th.ack_seq = htonl(ack);
910 rep.th.window = htons(win);
912 #ifdef CONFIG_TCP_MD5SIG
914 int offset = (tsecr) ? 3 : 0;
916 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
918 (TCPOPT_MD5SIG << 8) |
920 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
921 rep.th.doff = arg.iov[0].iov_len/4;
923 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
924 key, ip_hdr(skb)->saddr,
925 ip_hdr(skb)->daddr, &rep.th);
928 arg.flags = reply_flags;
929 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
930 ip_hdr(skb)->saddr, /* XXX */
931 arg.iov[0].iov_len, IPPROTO_TCP, 0);
932 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
934 arg.bound_dev_if = oif;
936 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
938 ctl_sk = this_cpu_read(ipv4_tcp_sk);
939 sock_net_set(ctl_sk, net);
940 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
941 inet_twsk(sk)->tw_mark : sk->sk_mark;
942 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
943 inet_twsk(sk)->tw_priority : sk->sk_priority;
944 transmit_time = tcp_transmit_time(sk);
945 ip_send_unicast_reply(ctl_sk,
946 skb, &TCP_SKB_CB(skb)->header.h4.opt,
947 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
948 &arg, arg.iov[0].iov_len,
952 sock_net_set(ctl_sk, &init_net);
953 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
957 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
959 struct inet_timewait_sock *tw = inet_twsk(sk);
960 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
962 tcp_v4_send_ack(sk, skb,
963 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
964 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
965 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
968 tcp_twsk_md5_key(tcptw),
969 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
976 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
977 struct request_sock *req)
979 const union tcp_md5_addr *addr;
982 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
983 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
985 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
993 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
994 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
995 tcp_v4_send_ack(sk, skb, seq,
996 tcp_rsk(req)->rcv_nxt,
997 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
998 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
1001 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
1002 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1007 * Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big socket.
1011 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1013 struct request_sock *req,
1014 struct tcp_fastopen_cookie *foc,
1015 enum tcp_synack_type synack_type,
1016 struct sk_buff *syn_skb)
1018 const struct inet_request_sock *ireq = inet_rsk(req);
1021 struct sk_buff *skb;
1024 /* First, grab a route. */
1025 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1028 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1031 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1033 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1034 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1035 (inet_sk(sk)->tos & INET_ECN_MASK) :
1038 if (!INET_ECN_is_capable(tos) &&
1039 tcp_bpf_ca_needs_ecn((struct sock *)req))
1040 tos |= INET_ECN_ECT_0;
1043 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1045 rcu_dereference(ireq->ireq_opt),
1048 err = net_xmit_eval(err);
1055 * IPv4 request_sock destructor.
1057 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1059 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1062 #ifdef CONFIG_TCP_MD5SIG
1064 * RFC2385 MD5 checksumming requires a mapping of
1065 * IP address->MD5 Key.
1066 * We need to maintain these in the sk structure.
1069 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1070 EXPORT_SYMBOL(tcp_md5_needed);
1072 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1077 /* l3index always overrides non-l3index */
1078 if (old->l3index && new->l3index == 0)
1080 if (old->l3index == 0 && new->l3index)
1083 return old->prefixlen < new->prefixlen;
1086 /* Find the Key structure for an address. */
1087 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1088 const union tcp_md5_addr *addr,
1091 const struct tcp_sock *tp = tcp_sk(sk);
1092 struct tcp_md5sig_key *key;
1093 const struct tcp_md5sig_info *md5sig;
1095 struct tcp_md5sig_key *best_match = NULL;
1098 /* caller either holds rcu_read_lock() or socket lock */
1099 md5sig = rcu_dereference_check(tp->md5sig_info,
1100 lockdep_sock_is_held(sk));
1104 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1105 lockdep_sock_is_held(sk)) {
1106 if (key->family != family)
1108 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1110 if (family == AF_INET) {
1111 mask = inet_make_mask(key->prefixlen);
1112 match = (key->addr.a4.s_addr & mask) ==
1113 (addr->a4.s_addr & mask);
1114 #if IS_ENABLED(CONFIG_IPV6)
1115 } else if (family == AF_INET6) {
1116 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1123 if (match && better_md5_match(best_match, key))
1128 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1130 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1131 const union tcp_md5_addr *addr,
1132 int family, u8 prefixlen,
1133 int l3index, u8 flags)
1135 const struct tcp_sock *tp = tcp_sk(sk);
1136 struct tcp_md5sig_key *key;
1137 unsigned int size = sizeof(struct in_addr);
1138 const struct tcp_md5sig_info *md5sig;
1140 /* caller either holds rcu_read_lock() or socket lock */
1141 md5sig = rcu_dereference_check(tp->md5sig_info,
1142 lockdep_sock_is_held(sk));
1145 #if IS_ENABLED(CONFIG_IPV6)
1146 if (family == AF_INET6)
1147 size = sizeof(struct in6_addr);
1149 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1150 lockdep_sock_is_held(sk)) {
1151 if (key->family != family)
1153 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1155 if (key->l3index != l3index)
1157 if (!memcmp(&key->addr, addr, size) &&
1158 key->prefixlen == prefixlen)
1164 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1165 const struct sock *addr_sk)
1167 const union tcp_md5_addr *addr;
1170 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1171 addr_sk->sk_bound_dev_if);
1172 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1173 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1175 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1177 /* This can be called on a newly created socket, from other files */
1178 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1179 int family, u8 prefixlen, int l3index, u8 flags,
1180 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1182 /* Add Key to the list */
1183 struct tcp_md5sig_key *key;
1184 struct tcp_sock *tp = tcp_sk(sk);
1185 struct tcp_md5sig_info *md5sig;
1187 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1189 /* Pre-existing entry - just update that one.
1190 * Note that the key might be used concurrently.
		 * data_race() is telling KCSAN that we do not care about
		 * key mismatches, since changing the MD5 key on live flows
		 * can lead to packet drops.
		 */
1195 data_race(memcpy(key->key, newkey, newkeylen));
1197 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch the new key->keylen value
		 * but the old key->key[]; this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
1202 WRITE_ONCE(key->keylen, newkeylen);
1207 md5sig = rcu_dereference_protected(tp->md5sig_info,
1208 lockdep_sock_is_held(sk));
1210 md5sig = kmalloc(sizeof(*md5sig), gfp);
1215 INIT_HLIST_HEAD(&md5sig->head);
1216 rcu_assign_pointer(tp->md5sig_info, md5sig);
1219 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1222 if (!tcp_alloc_md5sig_pool()) {
1223 sock_kfree_s(sk, key, sizeof(*key));
1227 memcpy(key->key, newkey, newkeylen);
1228 key->keylen = newkeylen;
1229 key->family = family;
1230 key->prefixlen = prefixlen;
1231 key->l3index = l3index;
1233 memcpy(&key->addr, addr,
1234 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1235 sizeof(struct in_addr));
1236 hlist_add_head_rcu(&key->node, &md5sig->head);
1239 EXPORT_SYMBOL(tcp_md5_do_add);
1241 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1242 u8 prefixlen, int l3index, u8 flags)
1244 struct tcp_md5sig_key *key;
1246 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1249 hlist_del_rcu(&key->node);
1250 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1251 kfree_rcu(key, rcu);
1254 EXPORT_SYMBOL(tcp_md5_do_del);
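/* Release every configured MD5 key; called from tcp_v4_destroy_sock() once
 * no other reference to the key list can exist.
 */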
1256 static void tcp_clear_md5_list(struct sock *sk)
1258 struct tcp_sock *tp = tcp_sk(sk);
1259 struct tcp_md5sig_key *key;
1260 struct hlist_node *n;
1261 struct tcp_md5sig_info *md5sig;
1263 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1265 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1266 hlist_del_rcu(&key->node);
1267 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1268 kfree_rcu(key, rcu);
1272 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1273 sockptr_t optval, int optlen)
1275 struct tcp_md5sig cmd;
1276 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1277 const union tcp_md5_addr *addr;
1282 if (optlen < sizeof(cmd))
1285 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1288 if (sin->sin_family != AF_INET)
1291 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1293 if (optname == TCP_MD5SIG_EXT &&
1294 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1295 prefixlen = cmd.tcpm_prefixlen;
1300 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1301 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1302 struct net_device *dev;
1305 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1306 if (dev && netif_is_l3_master(dev))
1307 l3index = dev->ifindex;
1311 /* ok to reference set/not set outside of rcu;
1312 * right now device MUST be an L3 master
1314 if (!dev || !l3index)
1318 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1320 if (!cmd.tcpm_keylen)
1321 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1323 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1326 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1327 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
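/* Illustrative userspace sketch (not part of this file): configuring a key
 * that the parser above accepts. "fd", the peer address and the secret are
 * placeholders.
 *
 *	struct tcp_md5sig md5 = { 0 };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	md5.tcpm_keylen = 6;
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */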
1330 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1331 __be32 daddr, __be32 saddr,
1332 const struct tcphdr *th, int nbytes)
1334 struct tcp4_pseudohdr *bp;
1335 struct scatterlist sg;
1342 bp->protocol = IPPROTO_TCP;
1343 bp->len = cpu_to_be16(nbytes);
1345 _th = (struct tcphdr *)(bp + 1);
1346 memcpy(_th, th, sizeof(*th));
1349 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1350 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1351 sizeof(*bp) + sizeof(*th));
1352 return crypto_ahash_update(hp->md5_req);
1355 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1356 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1358 struct tcp_md5sig_pool *hp;
1359 struct ahash_request *req;
1361 hp = tcp_get_md5sig_pool();
1363 goto clear_hash_noput;
1366 if (crypto_ahash_init(req))
1368 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1370 if (tcp_md5_hash_key(hp, key))
1372 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1373 if (crypto_ahash_final(req))
1376 tcp_put_md5sig_pool();
1380 tcp_put_md5sig_pool();
1382 memset(md5_hash, 0, 16);
1386 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1387 const struct sock *sk,
1388 const struct sk_buff *skb)
1390 struct tcp_md5sig_pool *hp;
1391 struct ahash_request *req;
1392 const struct tcphdr *th = tcp_hdr(skb);
1393 __be32 saddr, daddr;
1395 if (sk) { /* valid for establish/request sockets */
1396 saddr = sk->sk_rcv_saddr;
1397 daddr = sk->sk_daddr;
1399 const struct iphdr *iph = ip_hdr(skb);
1404 hp = tcp_get_md5sig_pool();
1406 goto clear_hash_noput;
1409 if (crypto_ahash_init(req))
1412 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1414 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1416 if (tcp_md5_hash_key(hp, key))
1418 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1419 if (crypto_ahash_final(req))
1422 tcp_put_md5sig_pool();
1426 tcp_put_md5sig_pool();
1428 memset(md5_hash, 0, 16);
1431 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
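/* Fill the IPv4 side of a freshly allocated request sock from the incoming
 * SYN: our local address is the packet's daddr, the peer is its saddr, and
 * any IP options are saved for the eventual SYN-ACK.
 */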
1435 static void tcp_v4_init_req(struct request_sock *req,
1436 const struct sock *sk_listener,
1437 struct sk_buff *skb)
1439 struct inet_request_sock *ireq = inet_rsk(req);
1440 struct net *net = sock_net(sk_listener);
1442 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1443 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1444 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1447 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1448 struct sk_buff *skb,
1450 struct request_sock *req)
1452 tcp_v4_init_req(req, sk, skb);
1454 if (security_inet_conn_request(sk, skb, req))
1457 return inet_csk_route_req(sk, &fl->u.ip4, req);
1460 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1462 .obj_size = sizeof(struct tcp_request_sock),
1463 .rtx_syn_ack = tcp_rtx_synack,
1464 .send_ack = tcp_v4_reqsk_send_ack,
1465 .destructor = tcp_v4_reqsk_destructor,
1466 .send_reset = tcp_v4_send_reset,
1467 .syn_ack_timeout = tcp_syn_ack_timeout,
1470 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1471 .mss_clamp = TCP_MSS_DEFAULT,
1472 #ifdef CONFIG_TCP_MD5SIG
1473 .req_md5_lookup = tcp_v4_md5_lookup,
1474 .calc_md5_hash = tcp_v4_md5_hash_skb,
1476 #ifdef CONFIG_SYN_COOKIES
1477 .cookie_init_seq = cookie_v4_init_sequence,
1479 .route_req = tcp_v4_route_req,
1480 .init_seq = tcp_v4_init_seq,
1481 .init_ts_off = tcp_v4_init_ts_off,
1482 .send_synack = tcp_v4_send_synack,
1485 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
	/* Never answer SYNs sent to broadcast or multicast */
1488 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1491 return tcp_conn_request(&tcp_request_sock_ops,
1492 &tcp_request_sock_ipv4_ops, sk, skb);
1498 EXPORT_SYMBOL(tcp_v4_conn_request);
1502 * The three way handshake has completed - we got a valid synack -
1503 * now create the new socket.
1505 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1506 struct request_sock *req,
1507 struct dst_entry *dst,
1508 struct request_sock *req_unhash,
1511 struct inet_request_sock *ireq;
1512 bool found_dup_sk = false;
1513 struct inet_sock *newinet;
1514 struct tcp_sock *newtp;
1516 #ifdef CONFIG_TCP_MD5SIG
1517 const union tcp_md5_addr *addr;
1518 struct tcp_md5sig_key *key;
1521 struct ip_options_rcu *inet_opt;
1523 if (sk_acceptq_is_full(sk))
1526 newsk = tcp_create_openreq_child(sk, req, skb);
1530 newsk->sk_gso_type = SKB_GSO_TCPV4;
1531 inet_sk_rx_dst_set(newsk, skb);
1533 newtp = tcp_sk(newsk);
1534 newinet = inet_sk(newsk);
1535 ireq = inet_rsk(req);
1536 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1537 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1538 newsk->sk_bound_dev_if = ireq->ir_iif;
1539 newinet->inet_saddr = ireq->ir_loc_addr;
1540 inet_opt = rcu_dereference(ireq->ireq_opt);
1541 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1542 newinet->mc_index = inet_iif(skb);
1543 newinet->mc_ttl = ip_hdr(skb)->ttl;
1544 newinet->rcv_tos = ip_hdr(skb)->tos;
1545 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1547 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1548 newinet->inet_id = get_random_u16();
1550 /* Set ToS of the new socket based upon the value of incoming SYN.
1551 * ECT bits are set later in tcp_init_transfer().
1553 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1554 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1557 dst = inet_csk_route_child_sock(sk, newsk, req);
1561 /* syncookie case : see end of cookie_v4_check() */
1563 sk_setup_caps(newsk, dst);
1565 tcp_ca_openreq_child(newsk, dst);
1567 tcp_sync_mss(newsk, dst_mtu(dst));
1568 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1570 tcp_initialize_rcv_mss(newsk);
1572 #ifdef CONFIG_TCP_MD5SIG
1573 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1574 /* Copy over the MD5 key from the original socket */
1575 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1576 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1579 * We're using one, so create a matching key
1580 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key across.
1584 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1585 key->key, key->keylen, GFP_ATOMIC);
1586 sk_gso_disable(newsk);
1590 if (__inet_inherit_port(sk, newsk) < 0)
1592 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1594 if (likely(*own_req)) {
1595 tcp_move_syn(newtp, req);
1596 ireq->ireq_opt = NULL;
1598 newinet->inet_opt = NULL;
1600 if (!req_unhash && found_dup_sk) {
		/* This code path should only be executed in the
		 * syncookie case
		 */
1604 bh_unlock_sock(newsk);
1612 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1619 newinet->inet_opt = NULL;
1620 inet_csk_prepare_forced_close(newsk);
1624 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
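/* If the segment is not a SYN, it may carry a SYN cookie issued while the
 * listener's queue was full; cookie_v4_check() validates it and
 * reconstructs the connection state that was never stored.
 */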
1626 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1628 #ifdef CONFIG_SYN_COOKIES
1629 const struct tcphdr *th = tcp_hdr(skb);
1632 sk = cookie_v4_check(sk, skb);
1637 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1638 struct tcphdr *th, u32 *cookie)
1641 #ifdef CONFIG_SYN_COOKIES
1642 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1643 &tcp_request_sock_ipv4_ops, sk, th);
1645 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1646 tcp_synq_overflow(sk);
1652 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock held.
 */
1662 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1664 enum skb_drop_reason reason;
1667 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1668 struct dst_entry *dst;
1670 dst = rcu_dereference_protected(sk->sk_rx_dst,
1671 lockdep_sock_is_held(sk));
1673 sock_rps_save_rxhash(sk, skb);
1674 sk_mark_napi_id(sk, skb);
1676 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1677 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1679 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1683 tcp_rcv_established(sk, skb);
1687 reason = SKB_DROP_REASON_NOT_SPECIFIED;
1688 if (tcp_checksum_complete(skb))
1691 if (sk->sk_state == TCP_LISTEN) {
1692 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1697 if (tcp_child_process(sk, nsk, skb)) {
1704 sock_rps_save_rxhash(sk, skb);
1706 if (tcp_rcv_state_process(sk, skb)) {
1713 tcp_v4_send_reset(rsk, skb);
1715 kfree_skb_reason(skb, reason);
1716 /* Be careful here. If this function gets more complicated and
1717 * gcc suffers from register pressure on the x86, sk (in %ebx)
1718 * might be destroyed here. This current version compiles correctly,
1719 * but you have been warned.
1724 reason = SKB_DROP_REASON_TCP_CSUM;
1725 trace_tcp_bad_csum(skb);
1726 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1727 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1730 EXPORT_SYMBOL(tcp_v4_do_rcv);
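/* Early demux: before the routing decision, try to match the packet to an
 * established socket so its cached rx dst can be reused and the main socket
 * lookup in tcp_v4_rcv() can be skipped.
 */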
1732 int tcp_v4_early_demux(struct sk_buff *skb)
1734 struct net *net = dev_net(skb->dev);
1735 const struct iphdr *iph;
1736 const struct tcphdr *th;
1739 if (skb->pkt_type != PACKET_HOST)
1742 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1748 if (th->doff < sizeof(struct tcphdr) / 4)
1751 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1752 iph->saddr, th->source,
1753 iph->daddr, ntohs(th->dest),
1754 skb->skb_iif, inet_sdif(skb));
1757 skb->destructor = sock_edemux;
1758 if (sk_fullsock(sk)) {
1759 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1762 dst = dst_check(dst, 0);
1764 sk->sk_rx_dst_ifindex == skb->skb_iif)
1765 skb_dst_set_noref(skb, dst);
1771 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1772 enum skb_drop_reason *reason)
1774 u32 limit, tail_gso_size, tail_gso_segs;
1775 struct skb_shared_info *shinfo;
1776 const struct tcphdr *th;
1777 struct tcphdr *thtail;
1778 struct sk_buff *tail;
1779 unsigned int hdrlen;
1785 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1786 * we can fix skb->truesize to its real value to avoid future drops.
1787 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed that pure SACK packets were sometimes dropped
	 * (if cooked by drivers without the copybreak feature).
	 */
1795 if (unlikely(tcp_checksum_complete(skb))) {
1797 trace_tcp_bad_csum(skb);
1798 *reason = SKB_DROP_REASON_TCP_CSUM;
1799 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1800 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	/* Attempt coalescing to last skb in backlog, even if we are above the limits.
1806 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1808 th = (const struct tcphdr *)skb->data;
1809 hdrlen = th->doff * 4;
1811 tail = sk->sk_backlog.tail;
1814 thtail = (struct tcphdr *)tail->data;
1816 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1817 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1818 ((TCP_SKB_CB(tail)->tcp_flags |
1819 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1820 !((TCP_SKB_CB(tail)->tcp_flags &
1821 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1822 ((TCP_SKB_CB(tail)->tcp_flags ^
1823 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1824 #ifdef CONFIG_TLS_DEVICE
1825 tail->decrypted != skb->decrypted ||
1827 thtail->doff != th->doff ||
1828 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1831 __skb_pull(skb, hdrlen);
1833 shinfo = skb_shinfo(skb);
1834 gso_size = shinfo->gso_size ?: skb->len;
1835 gso_segs = shinfo->gso_segs ?: 1;
1837 shinfo = skb_shinfo(tail);
1838 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1839 tail_gso_segs = shinfo->gso_segs ?: 1;
1841 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1842 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1844 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1845 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1846 thtail->window = th->window;
1849 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1850 * thtail->fin, so that the fast path in tcp_rcv_established()
1851 * is not entered if we append a packet with a FIN.
1852 * SYN, RST, URG are not present.
1853 * ACK is set on both packets.
1854 * PSH : we do not really care in TCP stack,
1855 * at least for 'GRO' packets.
1857 thtail->fin |= th->fin;
1858 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1860 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1861 TCP_SKB_CB(tail)->has_rxtstamp = true;
1862 tail->tstamp = skb->tstamp;
1863 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1866 /* Not as strict as GRO. We only need to carry mss max value */
1867 shinfo->gso_size = max(gso_size, tail_gso_size);
1868 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1870 sk->sk_backlog.len += delta;
1871 __NET_INC_STATS(sock_net(sk),
1872 LINUX_MIB_TCPBACKLOGCOALESCE);
1873 kfree_skb_partial(skb, fragstolen);
1876 __skb_push(skb, hdrlen);
1879 limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
	/* Only the socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few socket backlogs are likely to be concurrently non-empty.
	 */
1887 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1889 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1890 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1895 EXPORT_SYMBOL(tcp_add_backlog);
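/* Run the socket filter (if any) on the segment; the filter may trim the
 * skb, but never below the TCP header itself.
 */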
1897 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1899 struct tcphdr *th = (struct tcphdr *)skb->data;
1901 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1903 EXPORT_SYMBOL(tcp_filter);
1905 static void tcp_v4_restore_cb(struct sk_buff *skb)
1907 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1908 sizeof(struct inet_skb_parm));
1911 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1912 const struct tcphdr *th)
	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
	 * barrier() makes sure the compiler won't play fool^Waliasing games.
	 */
1917 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1918 sizeof(struct inet_skb_parm));
1921 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1922 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1923 skb->len - th->doff * 4);
1924 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1925 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1926 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1927 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1928 TCP_SKB_CB(skb)->sacked = 0;
1929 TCP_SKB_CB(skb)->has_rxtstamp =
1930 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1937 int tcp_v4_rcv(struct sk_buff *skb)
1939 struct net *net = dev_net(skb->dev);
1940 enum skb_drop_reason drop_reason;
1941 int sdif = inet_sdif(skb);
1942 int dif = inet_iif(skb);
1943 const struct iphdr *iph;
1944 const struct tcphdr *th;
1949 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1950 if (skb->pkt_type != PACKET_HOST)
1953 /* Count it even if it's bad */
1954 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1956 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1959 th = (const struct tcphdr *)skb->data;
1961 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1962 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1965 if (!pskb_may_pull(skb, th->doff * 4))
	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided the case of th->doff == 0 is eliminated.
	 * So, we defer the checks. */
1973 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1976 th = (const struct tcphdr *)skb->data;
1979 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
1980 skb, __tcp_hdrlen(th), th->source,
1981 th->dest, sdif, &refcounted);
1986 if (sk->sk_state == TCP_TIME_WAIT)
1989 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1990 struct request_sock *req = inet_reqsk(sk);
1991 bool req_stolen = false;
1994 sk = req->rsk_listener;
1995 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1996 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1998 drop_reason = tcp_inbound_md5_hash(sk, skb,
1999 &iph->saddr, &iph->daddr,
2000 AF_INET, dif, sdif);
2001 if (unlikely(drop_reason)) {
2002 sk_drops_add(sk, skb);
2006 if (tcp_checksum_complete(skb)) {
2010 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2011 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2013 inet_csk_reqsk_queue_drop_and_put(sk, req);
			/* reuseport_migrate_sock() has already held one sk_refcnt before returning. */
2021 /* We own a reference on the listener, increase it again
2022 * as we might lose it too soon.
2028 if (!tcp_filter(sk, skb)) {
2029 th = (const struct tcphdr *)skb->data;
2031 tcp_v4_fill_cb(skb, iph, th);
2032 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2034 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2039 /* Another cpu got exclusive access to req
2040 * and created a full blown socket.
2041 * Try to feed this packet to this socket
2042 * instead of discarding it.
2044 tcp_v4_restore_cb(skb);
2048 goto discard_and_relse;
2053 tcp_v4_restore_cb(skb);
2054 } else if (tcp_child_process(sk, nsk, skb)) {
2055 tcp_v4_send_reset(nsk, skb);
2056 goto discard_and_relse;
2063 if (static_branch_unlikely(&ip4_min_ttl)) {
2064 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2065 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2066 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2067 goto discard_and_relse;
2071 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2072 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2073 goto discard_and_relse;
2076 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2077 &iph->daddr, AF_INET, dif, sdif);
2079 goto discard_and_relse;
2083 if (tcp_filter(sk, skb)) {
2084 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2085 goto discard_and_relse;
2087 th = (const struct tcphdr *)skb->data;
2089 tcp_v4_fill_cb(skb, iph, th);
2093 if (sk->sk_state == TCP_LISTEN) {
2094 ret = tcp_v4_do_rcv(sk, skb);
2095 goto put_and_return;
2098 sk_incoming_cpu_update(sk);
2100 bh_lock_sock_nested(sk);
2101 tcp_segs_in(tcp_sk(sk), skb);
2103 if (!sock_owned_by_user(sk)) {
2104 ret = tcp_v4_do_rcv(sk, skb);
2106 if (tcp_add_backlog(sk, skb, &drop_reason))
2107 goto discard_and_relse;
2118 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2119 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2122 tcp_v4_fill_cb(skb, iph, th);
2124 if (tcp_checksum_complete(skb)) {
2126 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2127 trace_tcp_bad_csum(skb);
2128 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2130 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2132 tcp_v4_send_reset(NULL, skb);
2136 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2137 /* Discard frame. */
2138 kfree_skb_reason(skb, drop_reason);
2142 sk_drops_add(sk, skb);
2148 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2149 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2150 inet_twsk_put(inet_twsk(sk));
2154 tcp_v4_fill_cb(skb, iph, th);
2156 if (tcp_checksum_complete(skb)) {
2157 inet_twsk_put(inet_twsk(sk));
2160 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2162 struct sock *sk2 = inet_lookup_listener(net,
2163 net->ipv4.tcp_death_row.hashinfo,
2164 skb, __tcp_hdrlen(th),
2165 iph->saddr, th->source,
2166 iph->daddr, th->dest,
2170 inet_twsk_deschedule_put(inet_twsk(sk));
2172 tcp_v4_restore_cb(skb);
2180 tcp_v4_timewait_ack(sk, skb);
2183 tcp_v4_send_reset(sk, skb);
2184 inet_twsk_deschedule_put(inet_twsk(sk));
2186 case TCP_TW_SUCCESS:;
2191 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2192 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2193 .twsk_unique = tcp_twsk_unique,
	.twsk_destructor = tcp_twsk_destructor,
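/* Cache the route the packet arrived on so the established/early-demux
 * fast path can reuse it without another route lookup.
 */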
2197 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2199 struct dst_entry *dst = skb_dst(skb);
2201 if (dst && dst_hold_safe(dst)) {
2202 rcu_assign_pointer(sk->sk_rx_dst, dst);
2203 sk->sk_rx_dst_ifindex = skb->skb_iif;
2206 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2208 const struct inet_connection_sock_af_ops ipv4_specific = {
2209 .queue_xmit = ip_queue_xmit,
2210 .send_check = tcp_v4_send_check,
2211 .rebuild_header = inet_sk_rebuild_header,
2212 .sk_rx_dst_set = inet_sk_rx_dst_set,
2213 .conn_request = tcp_v4_conn_request,
2214 .syn_recv_sock = tcp_v4_syn_recv_sock,
2215 .net_header_len = sizeof(struct iphdr),
2216 .setsockopt = ip_setsockopt,
2217 .getsockopt = ip_getsockopt,
2218 .addr2sockaddr = inet_csk_addr2sockaddr,
2219 .sockaddr_len = sizeof(struct sockaddr_in),
2220 .mtu_reduced = tcp_v4_mtu_reduced,
2222 EXPORT_SYMBOL(ipv4_specific);
2224 #ifdef CONFIG_TCP_MD5SIG
2225 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2226 .md5_lookup = tcp_v4_md5_lookup,
2227 .calc_md5_hash = tcp_v4_md5_hash_skb,
2228 .md5_parse = tcp_v4_parse_md5_keys,
/* NOTE: A lot of things are set to zero explicitly by the call to
 * sk_alloc(), so they need not be done here.
 */
2235 static int tcp_v4_init_sock(struct sock *sk)
2237 struct inet_connection_sock *icsk = inet_csk(sk);
2241 icsk->icsk_af_ops = &ipv4_specific;
2243 #ifdef CONFIG_TCP_MD5SIG
2244 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2250 void tcp_v4_destroy_sock(struct sock *sk)
2252 struct tcp_sock *tp = tcp_sk(sk);
2254 trace_tcp_destroy_sock(sk);
2256 tcp_clear_xmit_timers(sk);
2258 tcp_cleanup_congestion_control(sk);
2260 tcp_cleanup_ulp(sk);
	/* Clean up the write buffer. */
2263 tcp_write_queue_purge(sk);
2265 /* Check if we want to disable active TFO */
2266 tcp_fastopen_active_disable_ofo_check(sk);
2268 /* Cleans up our, hopefully empty, out_of_order_queue. */
2269 skb_rbtree_purge(&tp->out_of_order_queue);
2271 #ifdef CONFIG_TCP_MD5SIG
2272 /* Clean up the MD5 key list, if any */
2273 if (tp->md5sig_info) {
2274 tcp_clear_md5_list(sk);
2275 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2276 tp->md5sig_info = NULL;
2280 /* Clean up a referenced TCP bind bucket. */
2281 if (inet_csk(sk)->icsk_bind_hash)
2284 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2286 /* If socket is aborted during connect operation */
2287 tcp_free_fastopen_req(tp);
2288 tcp_fastopen_destroy_cipher(sk);
2289 tcp_saved_syn_free(tp);
2291 sk_sockets_allocated_dec(sk);
2293 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2295 #ifdef CONFIG_PROC_FS
2296 /* Proc filesystem TCP sock list dumping. */
2298 static unsigned short seq_file_family(const struct seq_file *seq);
2300 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2302 unsigned short family = seq_file_family(seq);
2304 /* AF_UNSPEC is used as a match all */
2305 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2306 net_eq(sock_net(sk), seq_file_net(seq)));
2309 /* Find a non empty bucket (starting from st->bucket)
2310 * and return the first sk from it.
2312 static void *listening_get_first(struct seq_file *seq)
2314 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2315 struct tcp_iter_state *st = seq->private;
2318 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2319 struct inet_listen_hashbucket *ilb2;
2320 struct hlist_nulls_node *node;
2323 ilb2 = &hinfo->lhash2[st->bucket];
2324 if (hlist_nulls_empty(&ilb2->nulls_head))
2327 spin_lock(&ilb2->lock);
2328 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2329 if (seq_sk_match(seq, sk))
2332 spin_unlock(&ilb2->lock);
2338 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2339 * If "cur" is the last one in the st->bucket,
2340 * call listening_get_first() to return the first sk of the next non-empty bucket.
2343 static void *listening_get_next(struct seq_file *seq, void *cur)
2345 struct tcp_iter_state *st = seq->private;
2346 struct inet_listen_hashbucket *ilb2;
2347 struct hlist_nulls_node *node;
2348 struct inet_hashinfo *hinfo;
2349 struct sock *sk = cur;
2354 sk = sk_nulls_next(sk);
2355 sk_nulls_for_each_from(sk, node) {
2356 if (seq_sk_match(seq, sk))
2360 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2361 ilb2 = &hinfo->lhash2[st->bucket];
2362 spin_unlock(&ilb2->lock);
2364 return listening_get_first(seq);
2367 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2369 struct tcp_iter_state *st = seq->private;
2374 rc = listening_get_first(seq);
2376 while (rc && *pos) {
2377 rc = listening_get_next(seq, rc);
2383 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2384 const struct tcp_iter_state *st)
2386 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2390 * Get first established socket starting from bucket given in st->bucket.
2391 * If st->bucket is zero, the very first socket in the hash is returned.
2393 static void *established_get_first(struct seq_file *seq)
2395 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2396 struct tcp_iter_state *st = seq->private;
2399 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2401 struct hlist_nulls_node *node;
2402 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2404 /* Lockless fast path for the common case of empty buckets */
2405 if (empty_bucket(hinfo, st))
2409 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2410 if (seq_sk_match(seq, sk))
2413 spin_unlock_bh(lock);
2419 static void *established_get_next(struct seq_file *seq, void *cur)
2421 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2422 struct tcp_iter_state *st = seq->private;
2423 struct hlist_nulls_node *node;
2424 struct sock *sk = cur;
2429 sk = sk_nulls_next(sk);
2431 sk_nulls_for_each_from(sk, node) {
2432 if (seq_sk_match(seq, sk))
2436 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2438 return established_get_first(seq);
2441 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2443 struct tcp_iter_state *st = seq->private;
2447 rc = established_get_first(seq);
2450 rc = established_get_next(seq, rc);
2456 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2459 struct tcp_iter_state *st = seq->private;
2461 st->state = TCP_SEQ_STATE_LISTENING;
2462 rc = listening_get_idx(seq, &pos);
2465 st->state = TCP_SEQ_STATE_ESTABLISHED;
2466 rc = established_get_idx(seq, pos);
2472 static void *tcp_seek_last_pos(struct seq_file *seq)
2474 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2475 struct tcp_iter_state *st = seq->private;
2476 int bucket = st->bucket;
2477 int offset = st->offset;
2478 int orig_num = st->num;
2481 switch (st->state) {
2482 case TCP_SEQ_STATE_LISTENING:
2483 if (st->bucket > hinfo->lhash2_mask)
2485 st->state = TCP_SEQ_STATE_LISTENING;
2486 rc = listening_get_first(seq);
2487 while (offset-- && rc && bucket == st->bucket)
2488 rc = listening_get_next(seq, rc);
2492 st->state = TCP_SEQ_STATE_ESTABLISHED;
2494 case TCP_SEQ_STATE_ESTABLISHED:
2495 if (st->bucket > hinfo->ehash_mask)
2497 rc = established_get_first(seq);
2498 while (offset-- && rc && bucket == st->bucket)
2499 rc = established_get_next(seq, rc);
2507 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2509 struct tcp_iter_state *st = seq->private;
2512 if (*pos && *pos == st->last_pos) {
2513 rc = tcp_seek_last_pos(seq);
2518 st->state = TCP_SEQ_STATE_LISTENING;
2522 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2525 st->last_pos = *pos;
2528 EXPORT_SYMBOL(tcp_seq_start);
2530 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2532 struct tcp_iter_state *st = seq->private;
2535 if (v == SEQ_START_TOKEN) {
2536 rc = tcp_get_idx(seq, 0);
2540 switch (st->state) {
2541 case TCP_SEQ_STATE_LISTENING:
2542 rc = listening_get_next(seq, v);
2544 st->state = TCP_SEQ_STATE_ESTABLISHED;
2547 rc = established_get_first(seq);
2550 case TCP_SEQ_STATE_ESTABLISHED:
2551 rc = established_get_next(seq, v);
2556 st->last_pos = *pos;
2559 EXPORT_SYMBOL(tcp_seq_next);
2561 void tcp_seq_stop(struct seq_file *seq, void *v)
2563 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2564 struct tcp_iter_state *st = seq->private;
2566 switch (st->state) {
2567 case TCP_SEQ_STATE_LISTENING:
2568 if (v != SEQ_START_TOKEN)
2569 spin_unlock(&hinfo->lhash2[st->bucket].lock);
2571 case TCP_SEQ_STATE_ESTABLISHED:
2573 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2577 EXPORT_SYMBOL(tcp_seq_stop);
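/*
 * Editorial note: tcp_seq_start(), tcp_seq_next() and tcp_seq_stop() above
 * implement the generic seq_file protocol for /proc/net/tcp.  Roughly, the
 * seq_file core drives them like this (an illustrative pseudo-code sketch,
 * not the real fs/seq_file.c loop):
 *
 *	p = start(seq, &pos);
 *	while (p) {
 *		show(seq, p);			(tcp4_seq_show() below)
 *		p = next(seq, p, &pos);
 *	}
 *	stop(seq, p);
 *
 * start()/next() return with the current listening or established bucket
 * lock held so show() can safely dereference the socket; stop() drops it.
 */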
2579 static void get_openreq4(const struct request_sock *req,
2580 struct seq_file *f, int i)
2582 const struct inet_request_sock *ireq = inet_rsk(req);
2583 long delta = req->rsk_timer.expires - jiffies;
2585 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2586 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2591 ntohs(ireq->ir_rmt_port),
2593 0, 0, /* could print option size, but that is af dependent. */
2594 1, /* timers active (only the expire timer) */
2595 jiffies_delta_to_clock_t(delta),
2597 from_kuid_munged(seq_user_ns(f),
2598 sock_i_uid(req->rsk_listener)),
2599 0, /* non-standard timer */
2600 0, /* open_requests have no inode */
2605 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2608 unsigned long timer_expires;
2609 const struct tcp_sock *tp = tcp_sk(sk);
2610 const struct inet_connection_sock *icsk = inet_csk(sk);
2611 const struct inet_sock *inet = inet_sk(sk);
2612 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2613 __be32 dest = inet->inet_daddr;
2614 __be32 src = inet->inet_rcv_saddr;
2615 __u16 destp = ntohs(inet->inet_dport);
2616 __u16 srcp = ntohs(inet->inet_sport);
2620 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2621 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2622 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2624 timer_expires = icsk->icsk_timeout;
2625 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2627 timer_expires = icsk->icsk_timeout;
2628 } else if (timer_pending(&sk->sk_timer)) {
2630 timer_expires = sk->sk_timer.expires;
2633 timer_expires = jiffies;
2636 state = inet_sk_state_load(sk);
2637 if (state == TCP_LISTEN)
2638 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2640 /* Because we don't lock the socket,
2641 * we might find a transient negative value.
2643 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2644 READ_ONCE(tp->copied_seq), 0);
2646 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2647 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2648 i, src, srcp, dest, destp, state,
2649 READ_ONCE(tp->write_seq) - tp->snd_una,
2652 jiffies_delta_to_clock_t(timer_expires - jiffies),
2653 icsk->icsk_retransmits,
2654 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2655 icsk->icsk_probes_out,
2657 refcount_read(&sk->sk_refcnt), sk,
2658 jiffies_to_clock_t(icsk->icsk_rto),
2659 jiffies_to_clock_t(icsk->icsk_ack.ato),
2660 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2662 state == TCP_LISTEN ?
2663 fastopenq->max_qlen :
2664 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2667 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2668 struct seq_file *f, int i)
2670 long delta = tw->tw_timer.expires - jiffies;
2674 dest = tw->tw_daddr;
2675 src = tw->tw_rcv_saddr;
2676 destp = ntohs(tw->tw_dport);
2677 srcp = ntohs(tw->tw_sport);
2679 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2680 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2681 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2682 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2683 refcount_read(&tw->tw_refcnt), tw);
2688 static int tcp4_seq_show(struct seq_file *seq, void *v)
2690 struct tcp_iter_state *st;
2691 struct sock *sk = v;
2693 seq_setwidth(seq, TMPSZ - 1);
2694 if (v == SEQ_START_TOKEN) {
2695 seq_puts(seq, " sl local_address rem_address st tx_queue "
2696 "rx_queue tr tm->when retrnsmt uid timeout "
2702 if (sk->sk_state == TCP_TIME_WAIT)
2703 get_timewait4_sock(v, seq, st->num);
2704 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2705 get_openreq4(v, seq, st->num);
2707 get_tcp4_sock(v, seq, st->num);
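/*
 * Editorial note: a minimal userspace sketch (not part of this file)
 * showing how the columns printed above are consumed.  Addresses and
 * ports are emitted as raw hex via the %08X/%04X formats in
 * get_tcp4_sock(), so on little-endian hosts the address bytes appear
 * reversed relative to dotted-quad notation:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char line[512];
 *		FILE *f = fopen("/proc/net/tcp", "r");
 *
 *		if (!f)
 *			return 1;
 *		fgets(line, sizeof(line), f);		(skip the header line)
 *		while (fgets(line, sizeof(line), f)) {
 *			unsigned int addr, port, state;
 *
 *			if (sscanf(line, "%*d: %x:%x %*x:%*x %x",
 *				   &addr, &port, &state) == 3)
 *				printf("local %08X:%04X state %02X\n",
 *				       addr, port, state);
 *		}
 *		fclose(f);
 *		return 0;
 *	}
 */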
2713 #ifdef CONFIG_BPF_SYSCALL
2714 struct bpf_tcp_iter_state {
2715 struct tcp_iter_state state;
2716 unsigned int cur_sk;
2717 unsigned int end_sk;
2718 unsigned int max_sk;
2719 struct sock **batch;
2720 bool st_bucket_done;
2723 struct bpf_iter__tcp {
2724 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2725 __bpf_md_ptr(struct sock_common *, sk_common);
2726 uid_t uid __aligned(8);
2729 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2730 struct sock_common *sk_common, uid_t uid)
2732 struct bpf_iter__tcp ctx;
2734 meta->seq_num--; /* skip SEQ_START_TOKEN */
2736 ctx.sk_common = sk_common;
2738 return bpf_iter_run_prog(prog, &ctx);
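/*
 * Editorial note: tcp_prog_seq_show() above builds the bpf_iter__tcp
 * context and runs the attached program once per socket.  A minimal,
 * illustrative BPF-side sketch (built separately against vmlinux.h with
 * libbpf; dump_tcp is a hypothetical name, not something defined here):
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *		struct seq_file *seq = ctx->meta->seq;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(seq, "family=%d uid=%u\n",
 *			       skc->skc_family, ctx->uid);
 *		return 0;
 *	}
 */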
2741 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2743 while (iter->cur_sk < iter->end_sk)
2744 sock_put(iter->batch[iter->cur_sk++]);
2747 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2748 unsigned int new_batch_sz)
2750 struct sock **new_batch;
2752 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2753 GFP_USER | __GFP_NOWARN);
2757 bpf_iter_tcp_put_batch(iter);
2758 kvfree(iter->batch);
2759 iter->batch = new_batch;
2760 iter->max_sk = new_batch_sz;
2765 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2766 struct sock *start_sk)
2768 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2769 struct bpf_tcp_iter_state *iter = seq->private;
2770 struct tcp_iter_state *st = &iter->state;
2771 struct hlist_nulls_node *node;
2772 unsigned int expected = 1;
2775 sock_hold(start_sk);
2776 iter->batch[iter->end_sk++] = start_sk;
2778 sk = sk_nulls_next(start_sk);
2779 sk_nulls_for_each_from(sk, node) {
2780 if (seq_sk_match(seq, sk)) {
2781 if (iter->end_sk < iter->max_sk) {
2783 iter->batch[iter->end_sk++] = sk;
2788 spin_unlock(&hinfo->lhash2[st->bucket].lock);
2793 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2794 struct sock *start_sk)
2796 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2797 struct bpf_tcp_iter_state *iter = seq->private;
2798 struct tcp_iter_state *st = &iter->state;
2799 struct hlist_nulls_node *node;
2800 unsigned int expected = 1;
2803 sock_hold(start_sk);
2804 iter->batch[iter->end_sk++] = start_sk;
2806 sk = sk_nulls_next(start_sk);
2807 sk_nulls_for_each_from(sk, node) {
2808 if (seq_sk_match(seq, sk)) {
2809 if (iter->end_sk < iter->max_sk) {
2811 iter->batch[iter->end_sk++] = sk;
2816 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2821 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2823 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2824 struct bpf_tcp_iter_state *iter = seq->private;
2825 struct tcp_iter_state *st = &iter->state;
2826 unsigned int expected;
2827 bool resized = false;
2830 /* The st->bucket is done. Directly advance to the next
2831 * bucket instead of having tcp_seek_last_pos() skip the
2832 * sockets one by one in the current bucket, only to find out
2833 * it has to advance to the next bucket.
2835 if (iter->st_bucket_done) {
2838 if (st->state == TCP_SEQ_STATE_LISTENING &&
2839 st->bucket > hinfo->lhash2_mask) {
2840 st->state = TCP_SEQ_STATE_ESTABLISHED;
2846 /* Get a new batch */
2849 iter->st_bucket_done = false;
2851 sk = tcp_seek_last_pos(seq);
2853 return NULL; /* Done */
2855 if (st->state == TCP_SEQ_STATE_LISTENING)
2856 expected = bpf_iter_tcp_listening_batch(seq, sk);
2858 expected = bpf_iter_tcp_established_batch(seq, sk);
2860 if (iter->end_sk == expected) {
2861 iter->st_bucket_done = true;
2865 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2873 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2875 /* bpf iter does not support lseek, so it always
2876 * continues from where it was stop()-ped.
2879 return bpf_iter_tcp_batch(seq);
2881 return SEQ_START_TOKEN;
2884 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2886 struct bpf_tcp_iter_state *iter = seq->private;
2887 struct tcp_iter_state *st = &iter->state;
2890 /* Whenever seq_next() is called, the iter->cur_sk is
2891 * done with seq_show(), so advance to the next sk in the batch.
2894 if (iter->cur_sk < iter->end_sk) {
2895 /* Keeping st->num consistent in tcp_iter_state.
2896 * bpf_iter_tcp does not use st->num.
2897 * meta.seq_num is used instead.
2900 /* Move st->offset to the next sk in the bucket such that
2901 * the future start() will resume at st->offset in
2902 * st->bucket. See tcp_seek_last_pos().
2905 sock_put(iter->batch[iter->cur_sk++]);
2908 if (iter->cur_sk < iter->end_sk)
2909 sk = iter->batch[iter->cur_sk];
2911 sk = bpf_iter_tcp_batch(seq);
2914 /* Keeping st->last_pos consistent in tcp_iter_state.
2915 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2917 st->last_pos = *pos;
2921 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2923 struct bpf_iter_meta meta;
2924 struct bpf_prog *prog;
2925 struct sock *sk = v;
2930 if (v == SEQ_START_TOKEN)
2933 if (sk_fullsock(sk))
2934 slow = lock_sock_fast(sk);
2936 if (unlikely(sk_unhashed(sk))) {
2941 if (sk->sk_state == TCP_TIME_WAIT) {
2943 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2944 const struct request_sock *req = v;
2946 uid = from_kuid_munged(seq_user_ns(seq),
2947 sock_i_uid(req->rsk_listener));
2949 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2953 prog = bpf_iter_get_info(&meta, false);
2954 ret = tcp_prog_seq_show(prog, &meta, v, uid);
2957 if (sk_fullsock(sk))
2958 unlock_sock_fast(sk, slow);
2963 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2965 struct bpf_tcp_iter_state *iter = seq->private;
2966 struct bpf_iter_meta meta;
2967 struct bpf_prog *prog;
2971 prog = bpf_iter_get_info(&meta, true);
2973 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2976 if (iter->cur_sk < iter->end_sk) {
2977 bpf_iter_tcp_put_batch(iter);
2978 iter->st_bucket_done = false;
2982 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2983 .show = bpf_iter_tcp_seq_show,
2984 .start = bpf_iter_tcp_seq_start,
2985 .next = bpf_iter_tcp_seq_next,
2986 .stop = bpf_iter_tcp_seq_stop,
2989 static unsigned short seq_file_family(const struct seq_file *seq)
2991 const struct tcp_seq_afinfo *afinfo;
2993 #ifdef CONFIG_BPF_SYSCALL
2994 /* Iterated from bpf_iter. Let the bpf prog filter instead. */
2995 if (seq->op == &bpf_iter_tcp_seq_ops)
2999 /* Iterated from proc fs */
3000 afinfo = pde_data(file_inode(seq->file));
3001 return afinfo->family;
3004 static const struct seq_operations tcp4_seq_ops = {
3005 .show = tcp4_seq_show,
3006 .start = tcp_seq_start,
3007 .next = tcp_seq_next,
3008 .stop = tcp_seq_stop,
3011 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3015 static int __net_init tcp4_proc_init_net(struct net *net)
3017 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3018 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3023 static void __net_exit tcp4_proc_exit_net(struct net *net)
3025 remove_proc_entry("tcp", net->proc_net);
3028 static struct pernet_operations tcp4_net_ops = {
3029 .init = tcp4_proc_init_net,
3030 .exit = tcp4_proc_exit_net,
3033 int __init tcp4_proc_init(void)
3035 return register_pernet_subsys(&tcp4_net_ops);
3038 void tcp4_proc_exit(void)
3040 unregister_pernet_subsys(&tcp4_net_ops);
3042 #endif /* CONFIG_PROC_FS */
3044 /* @wake is one when sk_stream_write_space() calls us.
3045 * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3046 * This mimics the strategy used in sock_def_write_space().
3048 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3050 const struct tcp_sock *tp = tcp_sk(sk);
3051 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3052 READ_ONCE(tp->snd_nxt);
3054 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3056 EXPORT_SYMBOL(tcp_stream_memory_free);
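/*
 * Editorial note: with @wake == 1 the test above reduces to
 * "notsent_bytes < tcp_notsent_lowat(tp) / 2", i.e. blocked writers are
 * only woken once the unsent backlog has drained below half the limit.
 * Illustrative userspace sketch (not part of this file) configuring the
 * per-socket limit that feeds tcp_notsent_lowat():
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <sys/socket.h>
 *
 *	static int set_notsent_lowat(int fd)
 *	{
 *		int lowat = 128 * 1024;
 *
 *		return setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
 *				  &lowat, sizeof(lowat));
 *	}
 *
 * With the value above, EPOLLOUT wakeups from sk_stream_write_space()
 * resume once less than roughly 64 KB of written data remains unsent.
 */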
3058 struct proto tcp_prot = {
3060 .owner = THIS_MODULE,
3062 .pre_connect = tcp_v4_pre_connect,
3063 .connect = tcp_v4_connect,
3064 .disconnect = tcp_disconnect,
3065 .accept = inet_csk_accept,
3067 .init = tcp_v4_init_sock,
3068 .destroy = tcp_v4_destroy_sock,
3069 .shutdown = tcp_shutdown,
3070 .setsockopt = tcp_setsockopt,
3071 .getsockopt = tcp_getsockopt,
3072 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3073 .keepalive = tcp_set_keepalive,
3074 .recvmsg = tcp_recvmsg,
3075 .sendmsg = tcp_sendmsg,
3076 .sendpage = tcp_sendpage,
3077 .backlog_rcv = tcp_v4_do_rcv,
3078 .release_cb = tcp_release_cb,
3080 .unhash = inet_unhash,
3081 .get_port = inet_csk_get_port,
3082 .put_port = inet_put_port,
3083 #ifdef CONFIG_BPF_SYSCALL
3084 .psock_update_sk_prot = tcp_bpf_update_proto,
3086 .enter_memory_pressure = tcp_enter_memory_pressure,
3087 .leave_memory_pressure = tcp_leave_memory_pressure,
3088 .stream_memory_free = tcp_stream_memory_free,
3089 .sockets_allocated = &tcp_sockets_allocated,
3090 .orphan_count = &tcp_orphan_count,
3092 .memory_allocated = &tcp_memory_allocated,
3093 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
3095 .memory_pressure = &tcp_memory_pressure,
3096 .sysctl_mem = sysctl_tcp_mem,
3097 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3098 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3099 .max_header = MAX_TCP_HEADER,
3100 .obj_size = sizeof(struct tcp_sock),
3101 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3102 .twsk_prot = &tcp_timewait_sock_ops,
3103 .rsk_prot = &tcp_request_sock_ops,
3105 .no_autobind = true,
3106 .diag_destroy = tcp_abort,
3108 EXPORT_SYMBOL(tcp_prot);
3110 static void __net_exit tcp_sk_exit(struct net *net)
3112 if (net->ipv4.tcp_congestion_control)
3113 bpf_module_put(net->ipv4.tcp_congestion_control,
3114 net->ipv4.tcp_congestion_control->owner);
3117 static void __net_init tcp_set_hashinfo(struct net *net)
3119 struct inet_hashinfo *hinfo;
3120 unsigned int ehash_entries;
3121 struct net *old_net;
3123 if (net_eq(net, &init_net))
3126 old_net = current->nsproxy->net_ns;
3127 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3131 ehash_entries = roundup_pow_of_two(ehash_entries);
3132 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3134 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3135 "for a netns, fallback to the global one\n",
3138 hinfo = &tcp_hashinfo;
3139 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3142 net->ipv4.tcp_death_row.hashinfo = hinfo;
3143 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3144 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
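/*
 * Editorial note: a worked example of the sizing above, with illustrative
 * numbers.  If the creating netns has net.ipv4.tcp_child_ehash_entries
 * set to 1000, the child netns gets:
 *
 *	ehash_entries          = roundup_pow_of_two(1000) = 1024
 *	sysctl_max_tw_buckets  = 1024 / 2                  = 512
 *	sysctl_max_syn_backlog = max(128, 1024 / 128)      = 128
 *
 * With the sysctl left at its default of 0, the child netns keeps using
 * the global tcp_hashinfo instead of allocating its own ehash.
 */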
3147 static int __net_init tcp_sk_init(struct net *net)
3149 net->ipv4.sysctl_tcp_ecn = 2;
3150 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3152 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3153 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3154 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3155 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3156 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3158 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3159 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3160 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3162 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3163 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3164 net->ipv4.sysctl_tcp_syncookies = 1;
3165 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3166 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3167 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3168 net->ipv4.sysctl_tcp_orphan_retries = 0;
3169 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3170 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3171 net->ipv4.sysctl_tcp_tw_reuse = 2;
3172 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3174 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3175 tcp_set_hashinfo(net);
3177 net->ipv4.sysctl_tcp_sack = 1;
3178 net->ipv4.sysctl_tcp_window_scaling = 1;
3179 net->ipv4.sysctl_tcp_timestamps = 1;
3180 net->ipv4.sysctl_tcp_early_retrans = 3;
3181 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3182 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3183 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3184 net->ipv4.sysctl_tcp_max_reordering = 300;
3185 net->ipv4.sysctl_tcp_dsack = 1;
3186 net->ipv4.sysctl_tcp_app_win = 31;
3187 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3188 net->ipv4.sysctl_tcp_frto = 2;
3189 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3190 /* This limits the percentage of the congestion window which we
3191 * will allow a single TSO frame to consume. Building TSO frames
3192 * which are too large can cause TCP streams to be bursty.
3194 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3195 /* Default TSQ limit of 16 TSO segments */
3196 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3198 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3199 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3201 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3202 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
3203 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3204 net->ipv4.sysctl_tcp_autocorking = 1;
3205 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3206 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3207 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3208 if (net != &init_net) {
3209 memcpy(net->ipv4.sysctl_tcp_rmem,
3210 init_net.ipv4.sysctl_tcp_rmem,
3211 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3212 memcpy(net->ipv4.sysctl_tcp_wmem,
3213 init_net.ipv4.sysctl_tcp_wmem,
3214 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3216 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3217 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3218 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3219 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3220 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3221 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3223 /* Reno is always built in */
3224 if (!net_eq(net, &init_net) &&
3225 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3226 init_net.ipv4.tcp_congestion_control->owner))
3227 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3229 net->ipv4.tcp_congestion_control = &tcp_reno;
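/*
 * Editorial note: the per-netns defaults initialised above are exposed
 * under /proc/sys/net/ipv4/ in the corresponding namespace.  Illustrative
 * userspace sketch (not part of this file) reading one of them back:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char buf[16];
 *		FILE *f = fopen("/proc/sys/net/ipv4/tcp_syn_retries", "r");
 *
 *		if (f && fgets(buf, sizeof(buf), f))
 *			printf("tcp_syn_retries = %s", buf);
 *		if (f)
 *			fclose(f);
 *		return 0;
 *	}
 *
 * This prints 6 by default, matching TCP_SYN_RETRIES set above.
 */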
3234 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3238 tcp_twsk_purge(net_exit_list, AF_INET);
3240 list_for_each_entry(net, net_exit_list, exit_list) {
3241 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3242 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3243 tcp_fastopen_ctx_destroy(net);
3247 static struct pernet_operations __net_initdata tcp_sk_ops = {
3248 .init = tcp_sk_init,
3249 .exit = tcp_sk_exit,
3250 .exit_batch = tcp_sk_exit_batch,
3253 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3254 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3255 struct sock_common *sk_common, uid_t uid)
3257 #define INIT_BATCH_SZ 16
3259 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3261 struct bpf_tcp_iter_state *iter = priv_data;
3264 err = bpf_iter_init_seq_net(priv_data, aux);
3268 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3270 bpf_iter_fini_seq_net(priv_data);
3277 static void bpf_iter_fini_tcp(void *priv_data)
3279 struct bpf_tcp_iter_state *iter = priv_data;
3281 bpf_iter_fini_seq_net(priv_data);
3282 kvfree(iter->batch);
3285 static const struct bpf_iter_seq_info tcp_seq_info = {
3286 .seq_ops = &bpf_iter_tcp_seq_ops,
3287 .init_seq_private = bpf_iter_init_tcp,
3288 .fini_seq_private = bpf_iter_fini_tcp,
3289 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3292 static const struct bpf_func_proto *
3293 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3294 const struct bpf_prog *prog)
3297 case BPF_FUNC_setsockopt:
3298 return &bpf_sk_setsockopt_proto;
3299 case BPF_FUNC_getsockopt:
3300 return &bpf_sk_getsockopt_proto;
3306 static struct bpf_iter_reg tcp_reg_info = {
3308 .ctx_arg_info_size = 1,
3310 { offsetof(struct bpf_iter__tcp, sk_common),
3311 PTR_TO_BTF_ID_OR_NULL },
3313 .get_func_proto = bpf_iter_tcp_get_func_proto,
3314 .seq_info = &tcp_seq_info,
3317 static void __init bpf_iter_register(void)
3319 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3320 if (bpf_iter_reg_target(&tcp_reg_info))
3321 pr_warn("Warning: could not register bpf iterator tcp\n");
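/*
 * Editorial note: once bpf_iter_reg_target() above has registered the
 * "tcp" iterator target, userspace can attach a program to it and read
 * the output like a file.  Illustrative libbpf-based sketch (not part of
 * this file; skel->progs.dump_tcp refers to the hypothetical program from
 * the earlier sketch, error handling and includes omitted):
 *
 *	struct bpf_link *link;
 *	char buf[4096];
 *	int iter_fd, n;
 *
 *	link = bpf_program__attach_iter(skel->progs.dump_tcp, NULL);
 *	iter_fd = bpf_iter_create(bpf_link__fd(link));
 *	while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
 *		write(1, buf, n);
 *	close(iter_fd);
 *	bpf_link__destroy(link);
 */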
3326 void __init tcp_v4_init(void)
3330 for_each_possible_cpu(cpu) {
3333 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3334 IPPROTO_TCP, &init_net);
3336 panic("Failed to create the TCP control socket.\n");
3337 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3339 /* Please enforce IP_DF and IPID==0 for RST and
3340 * ACK sent in SYN-RECV and TIME-WAIT state.
3342 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3344 per_cpu(ipv4_tcp_sk, cpu) = sk;
3346 if (register_pernet_subsys(&tcp_sk_ops))
3347 panic("Failed to create the TCP control socket.\n");
3349 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3350 bpf_iter_register();