1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * Implementation of the Transmission Control Protocol(TCP).
9 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
48 #define pr_fmt(fmt) "TCP: " fmt
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
61 #include <net/net_namespace.h>
63 #include <net/inet_hashtables.h>
65 #include <net/transp_v6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
84 #include <trace/events/tcp.h>
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
98 return secure_tcp_seq(ip_hdr(skb)->daddr,
101 tcp_hdr(skb)->source);
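/* Illustrative note: the initial sequence number above is derived, in current
 * kernels, from a keyed hash of the connection 4-tuple plus a coarse clock,
 * roughly
 *
 *	ISN = F(saddr, daddr, sport, dport, secret) + (ns clock >> 6)
 *
 * where F is SipHash. The clock term keeps the sequence space of successive
 * incarnations of the same 4-tuple moving forward; the hashed term keeps the
 * ISN unpredictable to off-path attackers (RFC 6528 style).
 */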
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
106 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 const struct inet_timewait_sock *tw = inet_twsk(sktw);
112 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
113 struct tcp_sock *tp = tcp_sk(sk);
114 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
117 /* Still does not detect *everything* that goes through
118 * lo, since we require a loopback src or dst address
119 * or direct binding to 'lo' interface.
121 bool loopback = false;
122 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
124 #if IS_ENABLED(CONFIG_IPV6)
125 if (tw->tw_family == AF_INET6) {
126 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
134 if (ipv4_is_loopback(tw->tw_daddr) ||
135 ipv4_is_loopback(tw->tw_rcv_saddr))
142 /* With PAWS, it is safe from the viewpoint
143 of data integrity. Even without PAWS it is safe provided sequence
144 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
146 Actually, the idea is close to VJ's, only the timestamp cache is
147 held not per host but per port pair, and the TW bucket is used as state
150 If the TW bucket has already been destroyed we fall back to VJ's scheme
151 and use the initial timestamp retrieved from the peer table.
153 if (tcptw->tw_ts_recent_stamp &&
154 (!twp || (reuse && time_after32(ktime_get_seconds(),
155 tcptw->tw_ts_recent_stamp)))) {
156 /* In case of repair and re-using TIME-WAIT sockets we still
157 * want to be sure that it is safe as above but honor the
158 * sequence numbers and time stamps set as part of the repair
161 * Without this check re-using a TIME-WAIT socket with TCP
162 * repair would accumulate a -1 on the repair assigned
163 * sequence number. The first time it is reused the sequence
164 * is -1, the second time -2, etc. This fixes that issue
165 * without appearing to create any others.
167 if (likely(!tp->repair)) {
168 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
172 WRITE_ONCE(tp->write_seq, seq);
173 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
174 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
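/* Illustrative note: when the reuse above succeeds for a non-repair socket,
 * the new connection starts its send space just past anything the old
 * incarnation could still have in flight. With the old TIME-WAIT socket's
 * tw_snd_nxt at, say, 1000000:
 *
 *	write_seq = 1000000 + 65535 + 2 = 1065537
 *
 * i.e. one maximum unscaled window plus a little slack beyond the old send
 * space, so stray segments from the previous connection cannot be mistaken
 * for new data.
 */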
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
187 /* This check is replicated from tcp_v4_connect() and intended to
188 * prevent BPF program called below from accessing bytes that are out
189 * of the bound specified by user in addr_len.
191 if (addr_len < sizeof(struct sockaddr_in))
194 sock_owned_by_me(sk);
196 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
202 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203 struct inet_sock *inet = inet_sk(sk);
204 struct tcp_sock *tp = tcp_sk(sk);
205 __be16 orig_sport, orig_dport;
206 __be32 daddr, nexthop;
210 struct ip_options_rcu *inet_opt;
211 struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
213 if (addr_len < sizeof(struct sockaddr_in))
216 if (usin->sin_family != AF_INET)
217 return -EAFNOSUPPORT;
219 nexthop = daddr = usin->sin_addr.s_addr;
220 inet_opt = rcu_dereference_protected(inet->inet_opt,
221 lockdep_sock_is_held(sk));
222 if (inet_opt && inet_opt->opt.srr) {
225 nexthop = inet_opt->opt.faddr;
228 orig_sport = inet->inet_sport;
229 orig_dport = usin->sin_port;
230 fl4 = &inet->cork.fl.u.ip4;
231 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
232 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
234 orig_sport, orig_dport, sk);
237 if (err == -ENETUNREACH)
238 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
242 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
247 if (!inet_opt || !inet_opt->opt.srr)
250 if (!inet->inet_saddr)
251 inet->inet_saddr = fl4->saddr;
252 sk_rcv_saddr_set(sk, inet->inet_saddr);
254 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
255 /* Reset inherited state */
256 tp->rx_opt.ts_recent = 0;
257 tp->rx_opt.ts_recent_stamp = 0;
258 if (likely(!tp->repair))
259 WRITE_ONCE(tp->write_seq, 0);
262 inet->inet_dport = usin->sin_port;
263 sk_daddr_set(sk, daddr);
265 inet_csk(sk)->icsk_ext_hdr_len = 0;
267 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
269 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
271 /* Socket identity is still unknown (sport may be zero).
272 * However, we set the state to SYN-SENT and, without releasing the socket
273 * lock, select a source port, enter ourselves into the hash tables and
274 * complete initialization after this.
276 tcp_set_state(sk, TCP_SYN_SENT);
277 err = inet_hash_connect(tcp_death_row, sk);
283 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
284 inet->inet_sport, inet->inet_dport, sk);
290 /* OK, now commit destination to socket. */
291 sk->sk_gso_type = SKB_GSO_TCPV4;
292 sk_setup_caps(sk, &rt->dst);
295 if (likely(!tp->repair)) {
297 WRITE_ONCE(tp->write_seq,
298 secure_tcp_seq(inet->inet_saddr,
302 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
307 inet->inet_id = prandom_u32();
309 if (tcp_fastopen_defer_connect(sk, &err))
314 err = tcp_connect(sk);
323 * This unhashes the socket and releases the local port,
326 tcp_set_state(sk, TCP_CLOSE);
328 sk->sk_route_caps = 0;
329 inet->inet_dport = 0;
332 EXPORT_SYMBOL(tcp_v4_connect);
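/* Illustrative sketch: from user space this path is reached by an ordinary
 * connect() on a TCP socket; the peer address below is a hypothetical
 * example (192.0.2.1:80).
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *		.sin_addr   = { .s_addr = inet_addr("192.0.2.1") },
 *	};
 *
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * inet_stream_connect() takes the socket lock and calls tcp_v4_connect(),
 * which resolves the route, picks a source port, moves the socket to
 * SYN-SENT and emits the initial SYN via tcp_connect().
 */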
335 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
336 * It can be called through tcp_release_cb() if socket was owned by user
337 * at the time tcp_v4_err() was called to handle ICMP message.
339 void tcp_v4_mtu_reduced(struct sock *sk)
341 struct inet_sock *inet = inet_sk(sk);
342 struct dst_entry *dst;
345 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
347 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
348 dst = inet_csk_update_pmtu(sk, mtu);
352 /* Something is about to go wrong... Remember the soft error
353 * in case this connection is not able to recover.
355 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
356 sk->sk_err_soft = EMSGSIZE;
360 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
361 ip_sk_accept_pmtu(sk) &&
362 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
363 tcp_sync_mss(sk, mtu);
365 /* Resend the TCP packet because it's
366 * clear that the old packet has been
367 * dropped. This is the new "fast" path mtu
370 tcp_simple_retransmit(sk);
371 } /* else let the usual retransmit timer handle it */
373 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
375 static void do_redirect(struct sk_buff *skb, struct sock *sk)
377 struct dst_entry *dst = __sk_dst_check(sk, 0);
380 dst->ops->redirect(dst, sk, skb);
384 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
385 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
387 struct request_sock *req = inet_reqsk(sk);
388 struct net *net = sock_net(sk);
390 /* ICMPs are not backlogged, hence we cannot get
391 * an established socket here.
393 if (seq != tcp_rsk(req)->snt_isn) {
394 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
397 * Still in SYN_RECV, just remove it silently.
398 * There is no good way to pass the error to the newly
399 * created socket, and POSIX does not want network
400 * errors returned from accept().
402 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
403 tcp_listendrop(req->rsk_listener);
407 EXPORT_SYMBOL(tcp_req_err);
409 /* TCP-LD (RFC 6069) logic */
410 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
412 struct inet_connection_sock *icsk = inet_csk(sk);
413 struct tcp_sock *tp = tcp_sk(sk);
418 if (sock_owned_by_user(sk))
421 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
425 skb = tcp_rtx_queue_head(sk);
426 if (WARN_ON_ONCE(!skb))
429 icsk->icsk_backoff--;
430 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
431 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
433 tcp_mstamp_refresh(tp);
434 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
435 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
438 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
439 remaining, TCP_RTO_MAX);
441 /* RTO revert clocked out retransmission.
442 * Will retransmit now.
444 tcp_retransmit_timer(sk);
447 EXPORT_SYMBOL(tcp_ld_RTO_revert);
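/* Illustrative note: a worked example of the revert above. Suppose the
 * sender's RTO has been backed off three times (icsk_backoff == 3). When a
 * matching ICMP error for snd_una arrives, icsk_backoff drops to 2, the RTO
 * is recomputed from the current srtt and shifted by the new backoff (capped
 * at TCP_RTO_MAX), and the retransmit timer is re-armed with whatever part of
 * that shorter timeout has not already elapsed since the head skb was sent;
 * if nothing remains, the retransmit happens immediately.
 */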
450 * This routine is called by the ICMP module when it gets some
451 * sort of error condition. If err < 0 then the socket should
452 * be closed and the error returned to the user. If err > 0
453 * it's just the icmp type << 8 | icmp code. After adjustment
454 * header points to the first 8 bytes of the tcp header. We need
455 * to find the appropriate port.
457 * The locking strategy used here is very "optimistic". When
458 * someone else accesses the socket the ICMP is just dropped
459 * and for some paths there is no check at all.
460 * A more general error queue to queue errors for later handling
461 * is probably better.
465 int tcp_v4_err(struct sk_buff *skb, u32 info)
467 const struct iphdr *iph = (const struct iphdr *)skb->data;
468 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
470 struct inet_sock *inet;
471 const int type = icmp_hdr(skb)->type;
472 const int code = icmp_hdr(skb)->code;
474 struct request_sock *fastopen;
477 struct net *net = dev_net(skb->dev);
479 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
480 th->dest, iph->saddr, ntohs(th->source),
483 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
486 if (sk->sk_state == TCP_TIME_WAIT) {
487 inet_twsk_put(inet_twsk(sk));
490 seq = ntohl(th->seq);
491 if (sk->sk_state == TCP_NEW_SYN_RECV) {
492 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
493 type == ICMP_TIME_EXCEEDED ||
494 (type == ICMP_DEST_UNREACH &&
495 (code == ICMP_NET_UNREACH ||
496 code == ICMP_HOST_UNREACH)));
501 /* If too many ICMPs get dropped on busy
502 * servers this needs to be solved differently.
503 * We do take care of PMTU discovery (RFC1191) special case :
504 * we can receive locally generated ICMP messages while socket is held.
506 if (sock_owned_by_user(sk)) {
507 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
508 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
510 if (sk->sk_state == TCP_CLOSE)
513 if (static_branch_unlikely(&ip4_min_ttl)) {
514 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
515 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
516 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
522 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
523 fastopen = rcu_dereference(tp->fastopen_rsk);
524 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
525 if (sk->sk_state != TCP_LISTEN &&
526 !between(seq, snd_una, tp->snd_nxt)) {
527 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
533 if (!sock_owned_by_user(sk))
534 do_redirect(skb, sk);
536 case ICMP_SOURCE_QUENCH:
537 /* Just silently ignore these. */
539 case ICMP_PARAMETERPROB:
542 case ICMP_DEST_UNREACH:
543 if (code > NR_ICMP_UNREACH)
546 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
547 /* We are not interested in TCP_LISTEN and open_requests
548 * (SYN-ACKs sent out by Linux are always <576 bytes so
549 * they should go through unfragmented).
551 if (sk->sk_state == TCP_LISTEN)
554 WRITE_ONCE(tp->mtu_info, info);
555 if (!sock_owned_by_user(sk)) {
556 tcp_v4_mtu_reduced(sk);
558 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
564 err = icmp_err_convert[code].errno;
565 /* check if this ICMP message allows revert of backoff.
569 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
570 tcp_ld_RTO_revert(sk, seq);
572 case ICMP_TIME_EXCEEDED:
579 switch (sk->sk_state) {
582 /* Only in fast or simultaneous open. If a fast open socket is
583 * already accepted it is treated as a connected one below.
585 if (fastopen && !fastopen->sk)
588 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
590 if (!sock_owned_by_user(sk)) {
597 sk->sk_err_soft = err;
602 /* If we've already connected we will keep trying
603 * until we time out, or the user gives up.
605 * RFC 1122 4.2.3.9 allows us to treat as hard errors
606 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
607 * but it is obsoleted by PMTU discovery).
609 * Note that in the modern Internet, where routing is unreliable
610 * and broken firewalls sit in every dark corner, sending random
611 * errors on behalf of their masters, even these two messages have lost
612 * their original meaning (even Linux sends invalid PORT_UNREACHs).
614 * Now we are in compliance with RFCs.
619 if (!sock_owned_by_user(sk) && inet->recverr) {
622 } else { /* Only an error on timeout */
623 sk->sk_err_soft = err;
632 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
634 struct tcphdr *th = tcp_hdr(skb);
636 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
637 skb->csum_start = skb_transport_header(skb) - skb->head;
638 skb->csum_offset = offsetof(struct tcphdr, check);
641 /* This routine computes an IPv4 TCP checksum. */
642 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
644 const struct inet_sock *inet = inet_sk(sk);
646 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
648 EXPORT_SYMBOL(tcp_v4_send_check);
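/* Illustrative note: __tcp_v4_send_check() does not produce the final
 * checksum. It stores the complemented pseudo-header sum in th->check and
 * records where the result must go (skb->csum_start / skb->csum_offset); the
 * NIC, or skb_checksum_help() as the software fallback when CHECKSUM_PARTIAL
 * cannot be offloaded, then sums everything from csum_start onwards and
 * folds the result into that offset.
 */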
651 * This routine will send an RST to the other TCP.
653 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)?
655 * Answer: if a packet caused an RST, it is not for a socket
656 * existing in our system; if it is matched to a socket,
657 * it is just a duplicate segment or a bug in the other side's TCP.
658 * So we build the reply based only on the parameters
659 * that arrived with the segment.
660 * Exception: precedence violation. We do not implement it in any case.
663 #ifdef CONFIG_TCP_MD5SIG
664 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
666 #define OPTION_BYTES sizeof(__be32)
669 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
671 const struct tcphdr *th = tcp_hdr(skb);
674 __be32 opt[OPTION_BYTES / sizeof(__be32)];
676 struct ip_reply_arg arg;
677 #ifdef CONFIG_TCP_MD5SIG
678 struct tcp_md5sig_key *key = NULL;
679 const __u8 *hash_location = NULL;
680 unsigned char newhash[16];
682 struct sock *sk1 = NULL;
684 u64 transmit_time = 0;
688 /* Never send a reset in response to a reset. */
692 /* If sk not NULL, it means we did a successful lookup and incoming
693 * route had to be correct. prequeue might have dropped our dst.
695 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
698 /* Swap the send and the receive. */
699 memset(&rep, 0, sizeof(rep));
700 rep.th.dest = th->source;
701 rep.th.source = th->dest;
702 rep.th.doff = sizeof(struct tcphdr) / 4;
706 rep.th.seq = th->ack_seq;
709 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
710 skb->len - (th->doff << 2));
713 memset(&arg, 0, sizeof(arg));
714 arg.iov[0].iov_base = (unsigned char *)&rep;
715 arg.iov[0].iov_len = sizeof(rep.th);
717 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
718 #ifdef CONFIG_TCP_MD5SIG
720 hash_location = tcp_parse_md5sig_option(th);
721 if (sk && sk_fullsock(sk)) {
722 const union tcp_md5_addr *addr;
725 /* sdif set, means packet ingressed via a device
726 * in an L3 domain and inet_iif is set to it.
728 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
729 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
730 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
731 } else if (hash_location) {
732 const union tcp_md5_addr *addr;
733 int sdif = tcp_v4_sdif(skb);
734 int dif = inet_iif(skb);
738 * active side is lost. Try to find listening socket through
739 * source port, and then find md5 key through listening socket.
740 * We do not loosen security here:
741 * the incoming packet is checked against the MD5 hash of the key we find,
742 * and no RST is generated if the MD5 hash doesn't match.
744 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
746 th->source, ip_hdr(skb)->daddr,
747 ntohs(th->source), dif, sdif);
748 /* don't send rst if it can't find key */
752 /* sdif set, means packet ingressed via a device
753 * in an L3 domain and dif is set to it.
755 l3index = sdif ? dif : 0;
756 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
757 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
762 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
763 if (genhash || memcmp(hash_location, newhash, 16) != 0)
769 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
771 (TCPOPT_MD5SIG << 8) |
773 /* Update length and the length the header thinks exists */
774 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
775 rep.th.doff = arg.iov[0].iov_len / 4;
777 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
778 key, ip_hdr(skb)->saddr,
779 ip_hdr(skb)->daddr, &rep.th);
782 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
783 if (rep.opt[0] == 0) {
784 __be32 mrst = mptcp_reset_option(skb);
788 arg.iov[0].iov_len += sizeof(mrst);
789 rep.th.doff = arg.iov[0].iov_len / 4;
793 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
794 ip_hdr(skb)->saddr, /* XXX */
795 arg.iov[0].iov_len, IPPROTO_TCP, 0);
796 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
797 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
799 /* When socket is gone, all binding information is lost.
800 * Routing might fail in this case. No choice here: if we force the
801 * input interface, we will misroute in the case of an asymmetric route.
804 arg.bound_dev_if = sk->sk_bound_dev_if;
806 trace_tcp_send_reset(sk, skb);
809 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
810 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
812 arg.tos = ip_hdr(skb)->tos;
813 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
815 ctl_sk = this_cpu_read(ipv4_tcp_sk);
816 sock_net_set(ctl_sk, net);
818 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
819 inet_twsk(sk)->tw_mark : sk->sk_mark;
820 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
821 inet_twsk(sk)->tw_priority : sk->sk_priority;
822 transmit_time = tcp_transmit_time(sk);
824 ip_send_unicast_reply(ctl_sk,
825 skb, &TCP_SKB_CB(skb)->header.h4.opt,
826 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
827 &arg, arg.iov[0].iov_len,
831 sock_net_set(ctl_sk, &init_net);
832 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
833 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
836 #ifdef CONFIG_TCP_MD5SIG
842 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
843 outside of socket context, is certainly ugly. What can I do?
846 static void tcp_v4_send_ack(const struct sock *sk,
847 struct sk_buff *skb, u32 seq, u32 ack,
848 u32 win, u32 tsval, u32 tsecr, int oif,
849 struct tcp_md5sig_key *key,
850 int reply_flags, u8 tos)
852 const struct tcphdr *th = tcp_hdr(skb);
855 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
856 #ifdef CONFIG_TCP_MD5SIG
857 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
861 struct net *net = sock_net(sk);
862 struct ip_reply_arg arg;
866 memset(&rep.th, 0, sizeof(struct tcphdr));
867 memset(&arg, 0, sizeof(arg));
869 arg.iov[0].iov_base = (unsigned char *)&rep;
870 arg.iov[0].iov_len = sizeof(rep.th);
872 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
873 (TCPOPT_TIMESTAMP << 8) |
875 rep.opt[1] = htonl(tsval);
876 rep.opt[2] = htonl(tsecr);
877 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
880 /* Swap the send and the receive. */
881 rep.th.dest = th->source;
882 rep.th.source = th->dest;
883 rep.th.doff = arg.iov[0].iov_len / 4;
884 rep.th.seq = htonl(seq);
885 rep.th.ack_seq = htonl(ack);
887 rep.th.window = htons(win);
889 #ifdef CONFIG_TCP_MD5SIG
891 int offset = (tsecr) ? 3 : 0;
893 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
895 (TCPOPT_MD5SIG << 8) |
897 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
898 rep.th.doff = arg.iov[0].iov_len/4;
900 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
901 key, ip_hdr(skb)->saddr,
902 ip_hdr(skb)->daddr, &rep.th);
905 arg.flags = reply_flags;
906 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
907 ip_hdr(skb)->saddr, /* XXX */
908 arg.iov[0].iov_len, IPPROTO_TCP, 0);
909 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
911 arg.bound_dev_if = oif;
913 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
915 ctl_sk = this_cpu_read(ipv4_tcp_sk);
916 sock_net_set(ctl_sk, net);
917 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
918 inet_twsk(sk)->tw_mark : sk->sk_mark;
919 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
920 inet_twsk(sk)->tw_priority : sk->sk_priority;
921 transmit_time = tcp_transmit_time(sk);
922 ip_send_unicast_reply(ctl_sk,
923 skb, &TCP_SKB_CB(skb)->header.h4.opt,
924 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
925 &arg, arg.iov[0].iov_len,
929 sock_net_set(ctl_sk, &init_net);
930 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
934 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
936 struct inet_timewait_sock *tw = inet_twsk(sk);
937 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
939 tcp_v4_send_ack(sk, skb,
940 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
941 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
942 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
945 tcp_twsk_md5_key(tcptw),
946 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
953 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
954 struct request_sock *req)
956 const union tcp_md5_addr *addr;
959 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
960 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
962 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
966 * The window field (SEG.WND) of every outgoing segment, with the
967 * exception of <SYN> segments, MUST be right-shifted by
968 * Rcv.Wind.Shift bits:
970 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
971 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
972 tcp_v4_send_ack(sk, skb, seq,
973 tcp_rsk(req)->rcv_nxt,
974 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
975 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
978 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
979 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
984 * Send a SYN-ACK after having received a SYN.
985 * This still operates on a request_sock only, not on a big
988 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
990 struct request_sock *req,
991 struct tcp_fastopen_cookie *foc,
992 enum tcp_synack_type synack_type,
993 struct sk_buff *syn_skb)
995 const struct inet_request_sock *ireq = inet_rsk(req);
1001 /* First, grab a route. */
1002 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1005 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1008 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1010 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1011 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1012 (inet_sk(sk)->tos & INET_ECN_MASK) :
1015 if (!INET_ECN_is_capable(tos) &&
1016 tcp_bpf_ca_needs_ecn((struct sock *)req))
1017 tos |= INET_ECN_ECT_0;
1020 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1022 rcu_dereference(ireq->ireq_opt),
1025 err = net_xmit_eval(err);
1032 * IPv4 request_sock destructor.
1034 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1036 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1039 #ifdef CONFIG_TCP_MD5SIG
1041 * RFC2385 MD5 checksumming requires a mapping of
1042 * IP address->MD5 Key.
1043 * We need to maintain these in the sk structure.
1046 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1047 EXPORT_SYMBOL(tcp_md5_needed);
1049 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1054 /* l3index always overrides non-l3index */
1055 if (old->l3index && new->l3index == 0)
1057 if (old->l3index == 0 && new->l3index)
1060 return old->prefixlen < new->prefixlen;
1063 /* Find the Key structure for an address. */
1064 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1065 const union tcp_md5_addr *addr,
1068 const struct tcp_sock *tp = tcp_sk(sk);
1069 struct tcp_md5sig_key *key;
1070 const struct tcp_md5sig_info *md5sig;
1072 struct tcp_md5sig_key *best_match = NULL;
1075 /* caller either holds rcu_read_lock() or socket lock */
1076 md5sig = rcu_dereference_check(tp->md5sig_info,
1077 lockdep_sock_is_held(sk));
1081 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1082 lockdep_sock_is_held(sk)) {
1083 if (key->family != family)
1085 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1087 if (family == AF_INET) {
1088 mask = inet_make_mask(key->prefixlen);
1089 match = (key->addr.a4.s_addr & mask) ==
1090 (addr->a4.s_addr & mask);
1091 #if IS_ENABLED(CONFIG_IPV6)
1092 } else if (family == AF_INET6) {
1093 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1100 if (match && better_md5_match(best_match, key))
1105 EXPORT_SYMBOL(__tcp_md5_do_lookup);
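/* Illustrative note: the lookup above returns the most specific matching
 * key. With two hypothetical IPv4 keys
 *
 *	key A: 10.0.0.0/8,  no l3index
 *	key B: 10.0.0.0/24, no l3index
 *
 * a peer at 10.0.0.5 matches both and better_md5_match() picks key B because
 * of its longer prefix. A key bound to an L3 master device (non-zero
 * l3index) always beats one that is not, regardless of prefix length.
 */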
1107 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1108 const union tcp_md5_addr *addr,
1109 int family, u8 prefixlen,
1110 int l3index, u8 flags)
1112 const struct tcp_sock *tp = tcp_sk(sk);
1113 struct tcp_md5sig_key *key;
1114 unsigned int size = sizeof(struct in_addr);
1115 const struct tcp_md5sig_info *md5sig;
1117 /* caller either holds rcu_read_lock() or socket lock */
1118 md5sig = rcu_dereference_check(tp->md5sig_info,
1119 lockdep_sock_is_held(sk));
1122 #if IS_ENABLED(CONFIG_IPV6)
1123 if (family == AF_INET6)
1124 size = sizeof(struct in6_addr);
1126 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1127 lockdep_sock_is_held(sk)) {
1128 if (key->family != family)
1130 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1132 if (key->l3index != l3index)
1134 if (!memcmp(&key->addr, addr, size) &&
1135 key->prefixlen == prefixlen)
1141 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1142 const struct sock *addr_sk)
1144 const union tcp_md5_addr *addr;
1147 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1148 addr_sk->sk_bound_dev_if);
1149 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1150 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1152 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1154 /* This can be called on a newly created socket, from other files */
1155 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1156 int family, u8 prefixlen, int l3index, u8 flags,
1157 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1159 /* Add Key to the list */
1160 struct tcp_md5sig_key *key;
1161 struct tcp_sock *tp = tcp_sk(sk);
1162 struct tcp_md5sig_info *md5sig;
1164 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1166 /* Pre-existing entry - just update that one.
1167 * Note that the key might be used concurrently.
1168 * data_race() tells KCSAN that we do not care about
1169 * key mismatches, since changing MD5 key on live flows
1170 * can lead to packet drops.
1172 data_race(memcpy(key->key, newkey, newkeylen));
1174 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1175 * Also note that a reader could catch new key->keylen value
1176 * but old key->key[], this is the reason we use __GFP_ZERO
1177 * at sock_kmalloc() time below these lines.
1179 WRITE_ONCE(key->keylen, newkeylen);
1184 md5sig = rcu_dereference_protected(tp->md5sig_info,
1185 lockdep_sock_is_held(sk));
1187 md5sig = kmalloc(sizeof(*md5sig), gfp);
1192 INIT_HLIST_HEAD(&md5sig->head);
1193 rcu_assign_pointer(tp->md5sig_info, md5sig);
1196 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1199 if (!tcp_alloc_md5sig_pool()) {
1200 sock_kfree_s(sk, key, sizeof(*key));
1204 memcpy(key->key, newkey, newkeylen);
1205 key->keylen = newkeylen;
1206 key->family = family;
1207 key->prefixlen = prefixlen;
1208 key->l3index = l3index;
1210 memcpy(&key->addr, addr,
1211 (family == AF_INET6) ? sizeof(struct in6_addr) :
1212 sizeof(struct in_addr));
1213 hlist_add_head_rcu(&key->node, &md5sig->head);
1216 EXPORT_SYMBOL(tcp_md5_do_add);
1218 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1219 u8 prefixlen, int l3index, u8 flags)
1221 struct tcp_md5sig_key *key;
1223 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1226 hlist_del_rcu(&key->node);
1227 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1228 kfree_rcu(key, rcu);
1231 EXPORT_SYMBOL(tcp_md5_do_del);
1233 static void tcp_clear_md5_list(struct sock *sk)
1235 struct tcp_sock *tp = tcp_sk(sk);
1236 struct tcp_md5sig_key *key;
1237 struct hlist_node *n;
1238 struct tcp_md5sig_info *md5sig;
1240 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1242 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1243 hlist_del_rcu(&key->node);
1244 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1245 kfree_rcu(key, rcu);
1249 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1250 sockptr_t optval, int optlen)
1252 struct tcp_md5sig cmd;
1253 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1254 const union tcp_md5_addr *addr;
1259 if (optlen < sizeof(cmd))
1262 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1265 if (sin->sin_family != AF_INET)
1268 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1270 if (optname == TCP_MD5SIG_EXT &&
1271 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1272 prefixlen = cmd.tcpm_prefixlen;
1277 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1278 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1279 struct net_device *dev;
1282 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1283 if (dev && netif_is_l3_master(dev))
1284 l3index = dev->ifindex;
1288 /* ok to reference set/not set outside of rcu;
1289 * right now device MUST be an L3 master
1291 if (!dev || !l3index)
1295 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1297 if (!cmd.tcpm_keylen)
1298 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1300 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1303 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1304 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
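/* Illustrative sketch: user space feeds this parser through
 * setsockopt(TCP_MD5SIG), or TCP_MD5SIG_EXT when the prefix/ifindex
 * extensions are wanted. A minimal example for a hypothetical peer
 * 192.0.2.1:
 *
 *	struct tcp_md5sig md5 = { };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing tcpm_keylen == 0 deletes the key for that address instead.
 */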
1307 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1308 __be32 daddr, __be32 saddr,
1309 const struct tcphdr *th, int nbytes)
1311 struct tcp4_pseudohdr *bp;
1312 struct scatterlist sg;
1319 bp->protocol = IPPROTO_TCP;
1320 bp->len = cpu_to_be16(nbytes);
1322 _th = (struct tcphdr *)(bp + 1);
1323 memcpy(_th, th, sizeof(*th));
1326 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1327 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1328 sizeof(*bp) + sizeof(*th));
1329 return crypto_ahash_update(hp->md5_req);
1332 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1333 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1335 struct tcp_md5sig_pool *hp;
1336 struct ahash_request *req;
1338 hp = tcp_get_md5sig_pool();
1340 goto clear_hash_noput;
1343 if (crypto_ahash_init(req))
1345 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1347 if (tcp_md5_hash_key(hp, key))
1349 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1350 if (crypto_ahash_final(req))
1353 tcp_put_md5sig_pool();
1357 tcp_put_md5sig_pool();
1359 memset(md5_hash, 0, 16);
1363 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1364 const struct sock *sk,
1365 const struct sk_buff *skb)
1367 struct tcp_md5sig_pool *hp;
1368 struct ahash_request *req;
1369 const struct tcphdr *th = tcp_hdr(skb);
1370 __be32 saddr, daddr;
1372 if (sk) { /* valid for establish/request sockets */
1373 saddr = sk->sk_rcv_saddr;
1374 daddr = sk->sk_daddr;
1376 const struct iphdr *iph = ip_hdr(skb);
1381 hp = tcp_get_md5sig_pool();
1383 goto clear_hash_noput;
1386 if (crypto_ahash_init(req))
1389 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1391 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1393 if (tcp_md5_hash_key(hp, key))
1395 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1396 if (crypto_ahash_final(req))
1399 tcp_put_md5sig_pool();
1403 tcp_put_md5sig_pool();
1405 memset(md5_hash, 0, 16);
1408 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
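/* Illustrative note: per RFC 2385 the digest above is computed over, in
 * order, the IPv4 pseudo-header, the TCP header with its checksum zeroed
 * (options excluded), the segment payload, and finally the configured key.
 * Any change to that input, for example a NAT rewriting addresses or ports,
 * yields a different digest, which is why TCP-MD5 does not survive address
 * translation.
 */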
1412 static void tcp_v4_init_req(struct request_sock *req,
1413 const struct sock *sk_listener,
1414 struct sk_buff *skb)
1416 struct inet_request_sock *ireq = inet_rsk(req);
1417 struct net *net = sock_net(sk_listener);
1419 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1420 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1421 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1424 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1425 struct sk_buff *skb,
1427 struct request_sock *req)
1429 tcp_v4_init_req(req, sk, skb);
1431 if (security_inet_conn_request(sk, skb, req))
1434 return inet_csk_route_req(sk, &fl->u.ip4, req);
1437 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1439 .obj_size = sizeof(struct tcp_request_sock),
1440 .rtx_syn_ack = tcp_rtx_synack,
1441 .send_ack = tcp_v4_reqsk_send_ack,
1442 .destructor = tcp_v4_reqsk_destructor,
1443 .send_reset = tcp_v4_send_reset,
1444 .syn_ack_timeout = tcp_syn_ack_timeout,
1447 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1448 .mss_clamp = TCP_MSS_DEFAULT,
1449 #ifdef CONFIG_TCP_MD5SIG
1450 .req_md5_lookup = tcp_v4_md5_lookup,
1451 .calc_md5_hash = tcp_v4_md5_hash_skb,
1453 #ifdef CONFIG_SYN_COOKIES
1454 .cookie_init_seq = cookie_v4_init_sequence,
1456 .route_req = tcp_v4_route_req,
1457 .init_seq = tcp_v4_init_seq,
1458 .init_ts_off = tcp_v4_init_ts_off,
1459 .send_synack = tcp_v4_send_synack,
1462 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1464 /* Never answer to SYNs send to broadcast or multicast */
1465 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1468 return tcp_conn_request(&tcp_request_sock_ops,
1469 &tcp_request_sock_ipv4_ops, sk, skb);
1475 EXPORT_SYMBOL(tcp_v4_conn_request);
1479 * The three way handshake has completed - we got a valid synack -
1480 * now create the new socket.
1482 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1483 struct request_sock *req,
1484 struct dst_entry *dst,
1485 struct request_sock *req_unhash,
1488 struct inet_request_sock *ireq;
1489 bool found_dup_sk = false;
1490 struct inet_sock *newinet;
1491 struct tcp_sock *newtp;
1493 #ifdef CONFIG_TCP_MD5SIG
1494 const union tcp_md5_addr *addr;
1495 struct tcp_md5sig_key *key;
1498 struct ip_options_rcu *inet_opt;
1500 if (sk_acceptq_is_full(sk))
1503 newsk = tcp_create_openreq_child(sk, req, skb);
1507 newsk->sk_gso_type = SKB_GSO_TCPV4;
1508 inet_sk_rx_dst_set(newsk, skb);
1510 newtp = tcp_sk(newsk);
1511 newinet = inet_sk(newsk);
1512 ireq = inet_rsk(req);
1513 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1514 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1515 newsk->sk_bound_dev_if = ireq->ir_iif;
1516 newinet->inet_saddr = ireq->ir_loc_addr;
1517 inet_opt = rcu_dereference(ireq->ireq_opt);
1518 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1519 newinet->mc_index = inet_iif(skb);
1520 newinet->mc_ttl = ip_hdr(skb)->ttl;
1521 newinet->rcv_tos = ip_hdr(skb)->tos;
1522 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1524 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1525 newinet->inet_id = prandom_u32();
1527 /* Set ToS of the new socket based upon the value of incoming SYN.
1528 * ECT bits are set later in tcp_init_transfer().
1530 if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1531 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1534 dst = inet_csk_route_child_sock(sk, newsk, req);
1538 /* syncookie case : see end of cookie_v4_check() */
1540 sk_setup_caps(newsk, dst);
1542 tcp_ca_openreq_child(newsk, dst);
1544 tcp_sync_mss(newsk, dst_mtu(dst));
1545 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1547 tcp_initialize_rcv_mss(newsk);
1549 #ifdef CONFIG_TCP_MD5SIG
1550 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1551 /* Copy over the MD5 key from the original socket */
1552 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1553 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1556 * We're using one, so create a matching key
1557 * on the newsk structure. If we fail to get
1558 * memory, then we end up not copying the key
1561 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1562 key->key, key->keylen, GFP_ATOMIC);
1563 sk_gso_disable(newsk);
1567 if (__inet_inherit_port(sk, newsk) < 0)
1569 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1571 if (likely(*own_req)) {
1572 tcp_move_syn(newtp, req);
1573 ireq->ireq_opt = NULL;
1575 newinet->inet_opt = NULL;
1577 if (!req_unhash && found_dup_sk) {
1578 /* This code path should only be executed in the
1579 * syncookie case
1581 bh_unlock_sock(newsk);
1589 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1596 newinet->inet_opt = NULL;
1597 inet_csk_prepare_forced_close(newsk);
1601 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1603 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1605 #ifdef CONFIG_SYN_COOKIES
1606 const struct tcphdr *th = tcp_hdr(skb);
1609 sk = cookie_v4_check(sk, skb);
1614 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1615 struct tcphdr *th, u32 *cookie)
1618 #ifdef CONFIG_SYN_COOKIES
1619 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1620 &tcp_request_sock_ipv4_ops, sk, th);
1622 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1623 tcp_synq_overflow(sk);
1629 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1631 /* The socket must have its spinlock held when we get
1632 * here, unless it is a TCP_LISTEN socket.
1634 * We have a potential double-lock case here, so even when
1635 * doing backlog processing we use the BH locking scheme.
1636 * This is because we cannot sleep with the original spinlock
1639 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1641 enum skb_drop_reason reason;
1644 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1645 struct dst_entry *dst;
1647 dst = rcu_dereference_protected(sk->sk_rx_dst,
1648 lockdep_sock_is_held(sk));
1650 sock_rps_save_rxhash(sk, skb);
1651 sk_mark_napi_id(sk, skb);
1653 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1654 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1656 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1660 tcp_rcv_established(sk, skb);
1664 reason = SKB_DROP_REASON_NOT_SPECIFIED;
1665 if (tcp_checksum_complete(skb))
1668 if (sk->sk_state == TCP_LISTEN) {
1669 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1674 if (tcp_child_process(sk, nsk, skb)) {
1681 sock_rps_save_rxhash(sk, skb);
1683 if (tcp_rcv_state_process(sk, skb)) {
1690 tcp_v4_send_reset(rsk, skb);
1692 kfree_skb_reason(skb, reason);
1693 /* Be careful here. If this function gets more complicated and
1694 * gcc suffers from register pressure on the x86, sk (in %ebx)
1695 * might be destroyed here. This current version compiles correctly,
1696 * but you have been warned.
1701 reason = SKB_DROP_REASON_TCP_CSUM;
1702 trace_tcp_bad_csum(skb);
1703 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1704 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1707 EXPORT_SYMBOL(tcp_v4_do_rcv);
1709 int tcp_v4_early_demux(struct sk_buff *skb)
1711 const struct iphdr *iph;
1712 const struct tcphdr *th;
1715 if (skb->pkt_type != PACKET_HOST)
1718 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1724 if (th->doff < sizeof(struct tcphdr) / 4)
1727 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1728 iph->saddr, th->source,
1729 iph->daddr, ntohs(th->dest),
1730 skb->skb_iif, inet_sdif(skb));
1733 skb->destructor = sock_edemux;
1734 if (sk_fullsock(sk)) {
1735 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1738 dst = dst_check(dst, 0);
1740 sk->sk_rx_dst_ifindex == skb->skb_iif)
1741 skb_dst_set_noref(skb, dst);
1747 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1748 enum skb_drop_reason *reason)
1750 u32 limit, tail_gso_size, tail_gso_segs;
1751 struct skb_shared_info *shinfo;
1752 const struct tcphdr *th;
1753 struct tcphdr *thtail;
1754 struct sk_buff *tail;
1755 unsigned int hdrlen;
1761 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1762 * we can fix skb->truesize to its real value to avoid future drops.
1763 * This is valid because skb is not yet charged to the socket.
1764 * It has been noticed that pure SACK packets were sometimes dropped
1765 * (if cooked by drivers without copybreak feature).
1771 if (unlikely(tcp_checksum_complete(skb))) {
1773 trace_tcp_bad_csum(skb);
1774 *reason = SKB_DROP_REASON_TCP_CSUM;
1775 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1776 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1780 /* Attempt coalescing to last skb in backlog, even if we are
1781 * above the limits.
1782 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1784 th = (const struct tcphdr *)skb->data;
1785 hdrlen = th->doff * 4;
1787 tail = sk->sk_backlog.tail;
1790 thtail = (struct tcphdr *)tail->data;
1792 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1793 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1794 ((TCP_SKB_CB(tail)->tcp_flags |
1795 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1796 !((TCP_SKB_CB(tail)->tcp_flags &
1797 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1798 ((TCP_SKB_CB(tail)->tcp_flags ^
1799 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1800 #ifdef CONFIG_TLS_DEVICE
1801 tail->decrypted != skb->decrypted ||
1803 thtail->doff != th->doff ||
1804 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1807 __skb_pull(skb, hdrlen);
1809 shinfo = skb_shinfo(skb);
1810 gso_size = shinfo->gso_size ?: skb->len;
1811 gso_segs = shinfo->gso_segs ?: 1;
1813 shinfo = skb_shinfo(tail);
1814 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1815 tail_gso_segs = shinfo->gso_segs ?: 1;
1817 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1818 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1820 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1821 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1822 thtail->window = th->window;
1825 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1826 * thtail->fin, so that the fast path in tcp_rcv_established()
1827 * is not entered if we append a packet with a FIN.
1828 * SYN, RST, URG are not present.
1829 * ACK is set on both packets.
1830 * PSH : we do not really care in TCP stack,
1831 * at least for 'GRO' packets.
1833 thtail->fin |= th->fin;
1834 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1836 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1837 TCP_SKB_CB(tail)->has_rxtstamp = true;
1838 tail->tstamp = skb->tstamp;
1839 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1842 /* Not as strict as GRO. We only need to carry mss max value */
1843 shinfo->gso_size = max(gso_size, tail_gso_size);
1844 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1846 sk->sk_backlog.len += delta;
1847 __NET_INC_STATS(sock_net(sk),
1848 LINUX_MIB_TCPBACKLOGCOALESCE);
1849 kfree_skb_partial(skb, fragstolen);
1852 __skb_push(skb, hdrlen);
1855 /* Only the socket owner can try to collapse/prune rx queues
1856 * to reduce memory overhead, so add a little headroom here.
1857 * Few socket backlogs are likely to be non-empty concurrently.
1859 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
1861 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1863 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1864 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1869 EXPORT_SYMBOL(tcp_add_backlog);
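/* Illustrative note: a worked example of the coalescing above. If the
 * backlog tail already carries two 1448-byte segments (gso_size 1448,
 * gso_segs 2) and the new skb is a single in-order 1448-byte segment with
 * identical flags and DSCP, skb_try_coalesce() glues the payload onto the
 * tail, end_seq (and, if newer, ack_seq/window) is taken from the new
 * segment, and the tail ends up with gso_size 1448 and gso_segs 3, so one
 * backlog entry is processed instead of two.
 */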
1871 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1873 struct tcphdr *th = (struct tcphdr *)skb->data;
1875 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1877 EXPORT_SYMBOL(tcp_filter);
1879 static void tcp_v4_restore_cb(struct sk_buff *skb)
1881 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1882 sizeof(struct inet_skb_parm));
1885 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1886 const struct tcphdr *th)
1888 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1889 * barrier() makes sure the compiler won't play aliasing games.
1891 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1892 sizeof(struct inet_skb_parm));
1895 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1896 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1897 skb->len - th->doff * 4);
1898 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1899 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1900 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1901 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1902 TCP_SKB_CB(skb)->sacked = 0;
1903 TCP_SKB_CB(skb)->has_rxtstamp =
1904 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
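/* Illustrative note: end_seq above counts the sequence space a segment
 * consumes, including the SYN and FIN flags. For a hypothetical segment with
 * seq 1000, a 20-byte TCP header (doff == 5), 500 bytes of payload and FIN
 * set:
 *
 *	end_seq = 1000 + 0 (syn) + 1 (fin) + (520 - 20) = 1501
 *
 * i.e. the FIN consumes one sequence number beyond the payload.
 */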
1911 int tcp_v4_rcv(struct sk_buff *skb)
1913 struct net *net = dev_net(skb->dev);
1914 enum skb_drop_reason drop_reason;
1915 int sdif = inet_sdif(skb);
1916 int dif = inet_iif(skb);
1917 const struct iphdr *iph;
1918 const struct tcphdr *th;
1923 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1924 if (skb->pkt_type != PACKET_HOST)
1927 /* Count it even if it's bad */
1928 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1930 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1933 th = (const struct tcphdr *)skb->data;
1935 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1936 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1939 if (!pskb_may_pull(skb, th->doff * 4))
1942 /* An explanation is required here, I think.
1943 * Packet length and doff are validated by header prediction,
1944 * provided the case of th->doff == 0 is eliminated.
1945 * So, we defer the checks. */
1947 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1950 th = (const struct tcphdr *)skb->data;
1953 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1954 th->dest, sdif, &refcounted);
1959 if (sk->sk_state == TCP_TIME_WAIT)
1962 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1963 struct request_sock *req = inet_reqsk(sk);
1964 bool req_stolen = false;
1967 sk = req->rsk_listener;
1968 drop_reason = tcp_inbound_md5_hash(sk, skb,
1969 &iph->saddr, &iph->daddr,
1970 AF_INET, dif, sdif);
1971 if (unlikely(drop_reason)) {
1972 sk_drops_add(sk, skb);
1976 if (tcp_checksum_complete(skb)) {
1980 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1981 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
1983 inet_csk_reqsk_queue_drop_and_put(sk, req);
1987 /* reuseport_migrate_sock() has already held one sk_refcnt
1991 /* We own a reference on the listener, increase it again
1992 * as we might lose it too soon.
1998 if (!tcp_filter(sk, skb)) {
1999 th = (const struct tcphdr *)skb->data;
2001 tcp_v4_fill_cb(skb, iph, th);
2002 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2004 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2009 /* Another cpu got exclusive access to req
2010 * and created a full blown socket.
2011 * Try to feed this packet to this socket
2012 * instead of discarding it.
2014 tcp_v4_restore_cb(skb);
2018 goto discard_and_relse;
2022 tcp_v4_restore_cb(skb);
2023 } else if (tcp_child_process(sk, nsk, skb)) {
2024 tcp_v4_send_reset(nsk, skb);
2025 goto discard_and_relse;
2032 if (static_branch_unlikely(&ip4_min_ttl)) {
2033 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2034 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2035 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2036 goto discard_and_relse;
2040 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2041 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2042 goto discard_and_relse;
2045 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2046 &iph->daddr, AF_INET, dif, sdif);
2048 goto discard_and_relse;
2052 if (tcp_filter(sk, skb)) {
2053 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2054 goto discard_and_relse;
2056 th = (const struct tcphdr *)skb->data;
2058 tcp_v4_fill_cb(skb, iph, th);
2062 if (sk->sk_state == TCP_LISTEN) {
2063 ret = tcp_v4_do_rcv(sk, skb);
2064 goto put_and_return;
2067 sk_incoming_cpu_update(sk);
2069 sk_defer_free_flush(sk);
2070 bh_lock_sock_nested(sk);
2071 tcp_segs_in(tcp_sk(sk), skb);
2073 if (!sock_owned_by_user(sk)) {
2074 ret = tcp_v4_do_rcv(sk, skb);
2076 if (tcp_add_backlog(sk, skb, &drop_reason))
2077 goto discard_and_relse;
2088 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2089 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2092 tcp_v4_fill_cb(skb, iph, th);
2094 if (tcp_checksum_complete(skb)) {
2096 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2097 trace_tcp_bad_csum(skb);
2098 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2100 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2102 tcp_v4_send_reset(NULL, skb);
2106 /* Discard frame. */
2107 kfree_skb_reason(skb, drop_reason);
2111 sk_drops_add(sk, skb);
2117 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2118 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2119 inet_twsk_put(inet_twsk(sk));
2123 tcp_v4_fill_cb(skb, iph, th);
2125 if (tcp_checksum_complete(skb)) {
2126 inet_twsk_put(inet_twsk(sk));
2129 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2131 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2134 iph->saddr, th->source,
2135 iph->daddr, th->dest,
2139 inet_twsk_deschedule_put(inet_twsk(sk));
2141 tcp_v4_restore_cb(skb);
2149 tcp_v4_timewait_ack(sk, skb);
2152 tcp_v4_send_reset(sk, skb);
2153 inet_twsk_deschedule_put(inet_twsk(sk));
2155 case TCP_TW_SUCCESS:;
2160 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2161 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2162 .twsk_unique = tcp_twsk_unique,
2163 .twsk_destructor= tcp_twsk_destructor,
2166 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2168 struct dst_entry *dst = skb_dst(skb);
2170 if (dst && dst_hold_safe(dst)) {
2171 rcu_assign_pointer(sk->sk_rx_dst, dst);
2172 sk->sk_rx_dst_ifindex = skb->skb_iif;
2175 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2177 const struct inet_connection_sock_af_ops ipv4_specific = {
2178 .queue_xmit = ip_queue_xmit,
2179 .send_check = tcp_v4_send_check,
2180 .rebuild_header = inet_sk_rebuild_header,
2181 .sk_rx_dst_set = inet_sk_rx_dst_set,
2182 .conn_request = tcp_v4_conn_request,
2183 .syn_recv_sock = tcp_v4_syn_recv_sock,
2184 .net_header_len = sizeof(struct iphdr),
2185 .setsockopt = ip_setsockopt,
2186 .getsockopt = ip_getsockopt,
2187 .addr2sockaddr = inet_csk_addr2sockaddr,
2188 .sockaddr_len = sizeof(struct sockaddr_in),
2189 .mtu_reduced = tcp_v4_mtu_reduced,
2191 EXPORT_SYMBOL(ipv4_specific);
2193 #ifdef CONFIG_TCP_MD5SIG
2194 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2195 .md5_lookup = tcp_v4_md5_lookup,
2196 .calc_md5_hash = tcp_v4_md5_hash_skb,
2197 .md5_parse = tcp_v4_parse_md5_keys,
2201 /* NOTE: A lot of things are set to zero explicitly by the call to
2202 * sk_alloc(), so they need not be done here.
2204 static int tcp_v4_init_sock(struct sock *sk)
2206 struct inet_connection_sock *icsk = inet_csk(sk);
2210 icsk->icsk_af_ops = &ipv4_specific;
2212 #ifdef CONFIG_TCP_MD5SIG
2213 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2219 void tcp_v4_destroy_sock(struct sock *sk)
2221 struct tcp_sock *tp = tcp_sk(sk);
2223 trace_tcp_destroy_sock(sk);
2225 tcp_clear_xmit_timers(sk);
2227 tcp_cleanup_congestion_control(sk);
2229 tcp_cleanup_ulp(sk);
2231 /* Clean up the write buffer. */
2232 tcp_write_queue_purge(sk);
2234 /* Check if we want to disable active TFO */
2235 tcp_fastopen_active_disable_ofo_check(sk);
2237 /* Cleans up our, hopefully empty, out_of_order_queue. */
2238 skb_rbtree_purge(&tp->out_of_order_queue);
2240 #ifdef CONFIG_TCP_MD5SIG
2241 /* Clean up the MD5 key list, if any */
2242 if (tp->md5sig_info) {
2243 tcp_clear_md5_list(sk);
2244 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2245 tp->md5sig_info = NULL;
2249 /* Clean up a referenced TCP bind bucket. */
2250 if (inet_csk(sk)->icsk_bind_hash)
2253 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2255 /* If socket is aborted during connect operation */
2256 tcp_free_fastopen_req(tp);
2257 tcp_fastopen_destroy_cipher(sk);
2258 tcp_saved_syn_free(tp);
2260 sk_sockets_allocated_dec(sk);
2262 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2264 #ifdef CONFIG_PROC_FS
2265 /* Proc filesystem TCP sock list dumping. */
2267 static unsigned short seq_file_family(const struct seq_file *seq);
2269 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2271 unsigned short family = seq_file_family(seq);
2273 /* AF_UNSPEC is used as a match all */
2274 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2275 net_eq(sock_net(sk), seq_file_net(seq)));
2278 /* Find a non-empty bucket (starting from st->bucket)
2279 * and return the first sk from it.
2281 static void *listening_get_first(struct seq_file *seq)
2283 struct tcp_iter_state *st = seq->private;
2286 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2287 struct inet_listen_hashbucket *ilb2;
2288 struct inet_connection_sock *icsk;
2291 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2292 if (hlist_empty(&ilb2->head))
2295 spin_lock(&ilb2->lock);
2296 inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
2297 sk = (struct sock *)icsk;
2298 if (seq_sk_match(seq, sk))
2301 spin_unlock(&ilb2->lock);
2307 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2308 * If "cur" is the last one in the st->bucket,
2309 * call listening_get_first() to return the first sk of the next
2312 static void *listening_get_next(struct seq_file *seq, void *cur)
2314 struct tcp_iter_state *st = seq->private;
2315 struct inet_listen_hashbucket *ilb2;
2316 struct inet_connection_sock *icsk;
2317 struct sock *sk = cur;
2322 icsk = inet_csk(sk);
2323 inet_lhash2_for_each_icsk_continue(icsk) {
2324 sk = (struct sock *)icsk;
2325 if (seq_sk_match(seq, sk))
2329 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2330 spin_unlock(&ilb2->lock);
2332 return listening_get_first(seq);
2335 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2337 struct tcp_iter_state *st = seq->private;
2342 rc = listening_get_first(seq);
2344 while (rc && *pos) {
2345 rc = listening_get_next(seq, rc);
2351 static inline bool empty_bucket(const struct tcp_iter_state *st)
2353 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2357 * Get first established socket starting from bucket given in st->bucket.
2358 * If st->bucket is zero, the very first socket in the hash is returned.
2360 static void *established_get_first(struct seq_file *seq)
2362 struct tcp_iter_state *st = seq->private;
2365 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2367 struct hlist_nulls_node *node;
2368 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2370 /* Lockless fast path for the common case of empty buckets */
2371 if (empty_bucket(st))
2375 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2376 if (seq_sk_match(seq, sk))
2379 spin_unlock_bh(lock);
2385 static void *established_get_next(struct seq_file *seq, void *cur)
2387 struct sock *sk = cur;
2388 struct hlist_nulls_node *node;
2389 struct tcp_iter_state *st = seq->private;
2394 sk = sk_nulls_next(sk);
2396 sk_nulls_for_each_from(sk, node) {
2397 if (seq_sk_match(seq, sk))
2401 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2403 return established_get_first(seq);
2406 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2408 struct tcp_iter_state *st = seq->private;
2412 rc = established_get_first(seq);
2415 rc = established_get_next(seq, rc);
2421 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2424 struct tcp_iter_state *st = seq->private;
2426 st->state = TCP_SEQ_STATE_LISTENING;
2427 rc = listening_get_idx(seq, &pos);
2430 st->state = TCP_SEQ_STATE_ESTABLISHED;
2431 rc = established_get_idx(seq, pos);
2437 static void *tcp_seek_last_pos(struct seq_file *seq)
2439 struct tcp_iter_state *st = seq->private;
2440 int bucket = st->bucket;
2441 int offset = st->offset;
2442 int orig_num = st->num;
2445 switch (st->state) {
2446 case TCP_SEQ_STATE_LISTENING:
2447 if (st->bucket > tcp_hashinfo.lhash2_mask)
2449 st->state = TCP_SEQ_STATE_LISTENING;
2450 rc = listening_get_first(seq);
2451 while (offset-- && rc && bucket == st->bucket)
2452 rc = listening_get_next(seq, rc);
2456 st->state = TCP_SEQ_STATE_ESTABLISHED;
2458 case TCP_SEQ_STATE_ESTABLISHED:
2459 if (st->bucket > tcp_hashinfo.ehash_mask)
2461 rc = established_get_first(seq);
2462 while (offset-- && rc && bucket == st->bucket)
2463 rc = established_get_next(seq, rc);
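/* Usage note (not from the original source): every read() chunk of
 * /proc/net/tcp re-enters tcp_seq_start().  When *pos still matches the
 * cached st->last_pos, tcp_seek_last_pos() above resumes from the saved
 * state/bucket/offset instead of rescanning the hash tables from the
 * beginning, which keeps sequential dumps of large tables roughly linear.
 */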
2471 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2473 struct tcp_iter_state *st = seq->private;
2476 if (*pos && *pos == st->last_pos) {
2477 rc = tcp_seek_last_pos(seq);
2482 st->state = TCP_SEQ_STATE_LISTENING;
2486 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2489 st->last_pos = *pos;
2492 EXPORT_SYMBOL(tcp_seq_start);
2494 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2496 struct tcp_iter_state *st = seq->private;
2499 if (v == SEQ_START_TOKEN) {
2500 rc = tcp_get_idx(seq, 0);
2504 switch (st->state) {
2505 case TCP_SEQ_STATE_LISTENING:
2506 rc = listening_get_next(seq, v);
2508 st->state = TCP_SEQ_STATE_ESTABLISHED;
2511 rc = established_get_first(seq);
2514 case TCP_SEQ_STATE_ESTABLISHED:
2515 rc = established_get_next(seq, v);
2520 st->last_pos = *pos;
2523 EXPORT_SYMBOL(tcp_seq_next);
2525 void tcp_seq_stop(struct seq_file *seq, void *v)
2527 struct tcp_iter_state *st = seq->private;
2529 switch (st->state) {
2530 case TCP_SEQ_STATE_LISTENING:
2531 if (v != SEQ_START_TOKEN)
2532 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2534 case TCP_SEQ_STATE_ESTABLISHED:
2536 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2540 EXPORT_SYMBOL(tcp_seq_stop);
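/* A sketch of the dispatch done by tcp4_seq_show() below (descriptive,
 * not in the original source): each helper prints one row of
 * /proc/net/tcp.  NEW_SYN_RECV entries go through get_openreq4(),
 * TIME_WAIT entries through get_timewait4_sock(), and everything else
 * through get_tcp4_sock().  Addresses are emitted as raw hexadecimal
 * __be32 values while ports are converted with ntohs() first, matching
 * the historical format that tools such as netstat parse.
 */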
2542 static void get_openreq4(const struct request_sock *req,
2543 struct seq_file *f, int i)
2545 const struct inet_request_sock *ireq = inet_rsk(req);
2546 long delta = req->rsk_timer.expires - jiffies;
2548 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2549 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2554 ntohs(ireq->ir_rmt_port),
2556 0, 0, /* could print option size, but that is af dependent. */
2557 1, /* timers active (only the expire timer) */
2558 jiffies_delta_to_clock_t(delta),
2560 from_kuid_munged(seq_user_ns(f),
2561 sock_i_uid(req->rsk_listener)),
2562 0, /* non standard timer */
2563 0, /* open_requests have no inode */
2568 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2571 unsigned long timer_expires;
2572 const struct tcp_sock *tp = tcp_sk(sk);
2573 const struct inet_connection_sock *icsk = inet_csk(sk);
2574 const struct inet_sock *inet = inet_sk(sk);
2575 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2576 __be32 dest = inet->inet_daddr;
2577 __be32 src = inet->inet_rcv_saddr;
2578 __u16 destp = ntohs(inet->inet_dport);
2579 __u16 srcp = ntohs(inet->inet_sport);
2583 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2584 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2585 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2587 timer_expires = icsk->icsk_timeout;
2588 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2590 timer_expires = icsk->icsk_timeout;
2591 } else if (timer_pending(&sk->sk_timer)) {
2593 timer_expires = sk->sk_timer.expires;
2596 timer_expires = jiffies;
2599 state = inet_sk_state_load(sk);
2600 if (state == TCP_LISTEN)
2601 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2603 /* Because we don't lock the socket,
2604 * we might find a transient negative value.
2606 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2607 READ_ONCE(tp->copied_seq), 0);
2609 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2610 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2611 i, src, srcp, dest, destp, state,
2612 READ_ONCE(tp->write_seq) - tp->snd_una,
2615 jiffies_delta_to_clock_t(timer_expires - jiffies),
2616 icsk->icsk_retransmits,
2617 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2618 icsk->icsk_probes_out,
2620 refcount_read(&sk->sk_refcnt), sk,
2621 jiffies_to_clock_t(icsk->icsk_rto),
2622 jiffies_to_clock_t(icsk->icsk_ack.ato),
2623 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2625 state == TCP_LISTEN ?
2626 fastopenq->max_qlen :
2627 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2630 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2631 struct seq_file *f, int i)
2633 long delta = tw->tw_timer.expires - jiffies;
2637 dest = tw->tw_daddr;
2638 src = tw->tw_rcv_saddr;
2639 destp = ntohs(tw->tw_dport);
2640 srcp = ntohs(tw->tw_sport);
2642 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2643 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2644 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2645 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2646 refcount_read(&tw->tw_refcnt), tw);
2651 static int tcp4_seq_show(struct seq_file *seq, void *v)
2653 struct tcp_iter_state *st;
2654 struct sock *sk = v;
2656 seq_setwidth(seq, TMPSZ - 1);
2657 if (v == SEQ_START_TOKEN) {
2658 seq_puts(seq, " sl local_address rem_address st tx_queue "
2659 "rx_queue tr tm->when retrnsmt uid timeout "
2665 if (sk->sk_state == TCP_TIME_WAIT)
2666 get_timewait4_sock(v, seq, st->num);
2667 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2668 get_openreq4(v, seq, st->num);
2670 get_tcp4_sock(v, seq, st->num);
2676 #ifdef CONFIG_BPF_SYSCALL
2677 struct bpf_tcp_iter_state {
2678 struct tcp_iter_state state;
2679 unsigned int cur_sk;
2680 unsigned int end_sk;
2681 unsigned int max_sk;
2682 struct sock **batch;
2683 bool st_bucket_done;
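/* Descriptive note on the batching state above (not in the original
 * source): bpf_iter_tcp_batch() copies every matching socket of the
 * current bucket into @batch, taking a reference with sock_hold(), so
 * the bucket lock can be dropped before the BPF program runs on each
 * socket.  @cur_sk/@end_sk track the show/next cursor within the batch,
 * @max_sk is the allocated capacity, and @st_bucket_done records whether
 * the whole bucket fit, so the next start() can skip straight to the
 * following bucket.
 */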
2686 struct bpf_iter__tcp {
2687 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2688 __bpf_md_ptr(struct sock_common *, sk_common);
2689 uid_t uid __aligned(8);
2692 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2693 struct sock_common *sk_common, uid_t uid)
2695 struct bpf_iter__tcp ctx;
2697 meta->seq_num--; /* skip SEQ_START_TOKEN */
2699 ctx.sk_common = sk_common;
2701 return bpf_iter_run_prog(prog, &ctx);
2704 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2706 while (iter->cur_sk < iter->end_sk)
2707 sock_put(iter->batch[iter->cur_sk++]);
2710 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2711 unsigned int new_batch_sz)
2713 struct sock **new_batch;
2715 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2716 GFP_USER | __GFP_NOWARN);
2720 bpf_iter_tcp_put_batch(iter);
2721 kvfree(iter->batch);
2722 iter->batch = new_batch;
2723 iter->max_sk = new_batch_sz;
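/* Note (based on the code below): the two *_batch() helpers return the
 * number of matching sockets found in the bucket ("expected"), while only
 * up to iter->max_sk of them are actually stored in iter->batch.
 * bpf_iter_tcp_batch() compares end_sk against that count to detect
 * overflow, grows the batch (to roughly 3/2 of "expected") and retries
 * the bucket.
 */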
2728 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2729 struct sock *start_sk)
2731 struct bpf_tcp_iter_state *iter = seq->private;
2732 struct tcp_iter_state *st = &iter->state;
2733 struct inet_connection_sock *icsk;
2734 unsigned int expected = 1;
2737 sock_hold(start_sk);
2738 iter->batch[iter->end_sk++] = start_sk;
2740 icsk = inet_csk(start_sk);
2741 inet_lhash2_for_each_icsk_continue(icsk) {
2742 sk = (struct sock *)icsk;
2743 if (seq_sk_match(seq, sk)) {
2744 if (iter->end_sk < iter->max_sk) {
2746 iter->batch[iter->end_sk++] = sk;
2751 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2756 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2757 struct sock *start_sk)
2759 struct bpf_tcp_iter_state *iter = seq->private;
2760 struct tcp_iter_state *st = &iter->state;
2761 struct hlist_nulls_node *node;
2762 unsigned int expected = 1;
2765 sock_hold(start_sk);
2766 iter->batch[iter->end_sk++] = start_sk;
2768 sk = sk_nulls_next(start_sk);
2769 sk_nulls_for_each_from(sk, node) {
2770 if (seq_sk_match(seq, sk)) {
2771 if (iter->end_sk < iter->max_sk) {
2773 iter->batch[iter->end_sk++] = sk;
2778 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2783 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2785 struct bpf_tcp_iter_state *iter = seq->private;
2786 struct tcp_iter_state *st = &iter->state;
2787 unsigned int expected;
2788 bool resized = false;
2791 /* The st->bucket is done. Directly advance to the next
2792 * bucket instead of having tcp_seek_last_pos() skip sockets
2793 * one by one in the current bucket, only to find out that
2794 * it has to advance to the next bucket anyway.
2796 if (iter->st_bucket_done) {
2799 if (st->state == TCP_SEQ_STATE_LISTENING &&
2800 st->bucket > tcp_hashinfo.lhash2_mask) {
2801 st->state = TCP_SEQ_STATE_ESTABLISHED;
2807 /* Get a new batch */
2810 iter->st_bucket_done = false;
2812 sk = tcp_seek_last_pos(seq);
2814 return NULL; /* Done */
2816 if (st->state == TCP_SEQ_STATE_LISTENING)
2817 expected = bpf_iter_tcp_listening_batch(seq, sk);
2819 expected = bpf_iter_tcp_established_batch(seq, sk);
2821 if (iter->end_sk == expected) {
2822 iter->st_bucket_done = true;
2826 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2834 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2836 /* bpf iter does not support lseek, so it always
2837 * continues from where it was stop()-ped.
2840 return bpf_iter_tcp_batch(seq);
2842 return SEQ_START_TOKEN;
2845 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2847 struct bpf_tcp_iter_state *iter = seq->private;
2848 struct tcp_iter_state *st = &iter->state;
2851 /* Whenever seq_next() is called, the iter->cur_sk is
2852 * done with seq_show(), so advance to the next sk in the batch.
2855 if (iter->cur_sk < iter->end_sk) {
2856 /* Keeping st->num consistent in tcp_iter_state.
2857 * bpf_iter_tcp does not use st->num.
2858 * meta.seq_num is used instead.
2861 /* Move st->offset to the next sk in the bucket such that
2862 * the future start() will resume at st->offset in
2863 * st->bucket. See tcp_seek_last_pos().
2866 sock_put(iter->batch[iter->cur_sk++]);
2869 if (iter->cur_sk < iter->end_sk)
2870 sk = iter->batch[iter->cur_sk];
2872 sk = bpf_iter_tcp_batch(seq);
2875 /* Keeping st->last_pos consistent in tcp_iter_state.
2876 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2878 st->last_pos = *pos;
2882 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2884 struct bpf_iter_meta meta;
2885 struct bpf_prog *prog;
2886 struct sock *sk = v;
2891 if (v == SEQ_START_TOKEN)
2894 if (sk_fullsock(sk))
2895 slow = lock_sock_fast(sk);
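/* Note (descriptive, not in the original source): only full sockets can
 * be locked here; TIME_WAIT and NEW_SYN_RECV entries are minisockets
 * without a real socket lock, hence the sk_fullsock() checks around
 * lock_sock_fast()/unlock_sock_fast().
 */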
2897 if (unlikely(sk_unhashed(sk))) {
2902 if (sk->sk_state == TCP_TIME_WAIT) {
2904 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2905 const struct request_sock *req = v;
2907 uid = from_kuid_munged(seq_user_ns(seq),
2908 sock_i_uid(req->rsk_listener));
2910 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2914 prog = bpf_iter_get_info(&meta, false);
2915 ret = tcp_prog_seq_show(prog, &meta, v, uid);
2918 if (sk_fullsock(sk))
2919 unlock_sock_fast(sk, slow);
2924 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2926 struct bpf_tcp_iter_state *iter = seq->private;
2927 struct bpf_iter_meta meta;
2928 struct bpf_prog *prog;
2932 prog = bpf_iter_get_info(&meta, true);
2934 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2937 if (iter->cur_sk < iter->end_sk) {
2938 bpf_iter_tcp_put_batch(iter);
2939 iter->st_bucket_done = false;
2943 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2944 .show = bpf_iter_tcp_seq_show,
2945 .start = bpf_iter_tcp_seq_start,
2946 .next = bpf_iter_tcp_seq_next,
2947 .stop = bpf_iter_tcp_seq_stop,
2950 static unsigned short seq_file_family(const struct seq_file *seq)
2952 const struct tcp_seq_afinfo *afinfo;
2954 #ifdef CONFIG_BPF_SYSCALL
2955 /* Iterated from bpf_iter. Let the bpf prog filter instead. */
2956 if (seq->op == &bpf_iter_tcp_seq_ops)
2960 /* Iterated from proc fs */
2961 afinfo = pde_data(file_inode(seq->file));
2962 return afinfo->family;
2965 static const struct seq_operations tcp4_seq_ops = {
2966 .show = tcp4_seq_show,
2967 .start = tcp_seq_start,
2968 .next = tcp_seq_next,
2969 .stop = tcp_seq_stop,
2972 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2976 static int __net_init tcp4_proc_init_net(struct net *net)
2978 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2979 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2984 static void __net_exit tcp4_proc_exit_net(struct net *net)
2986 remove_proc_entry("tcp", net->proc_net);
2989 static struct pernet_operations tcp4_net_ops = {
2990 .init = tcp4_proc_init_net,
2991 .exit = tcp4_proc_exit_net,
2994 int __init tcp4_proc_init(void)
2996 return register_pernet_subsys(&tcp4_net_ops);
2999 void tcp4_proc_exit(void)
3001 unregister_pernet_subsys(&tcp4_net_ops);
3003 #endif /* CONFIG_PROC_FS */
3005 /* @wake is one when sk_stream_write_space() calls us.
3006 * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3007 * This mimics the strategy used in sock_def_write_space().
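* A worked example (illustrative numbers, not from the source): with
* tcp_notsent_lowat set to 128 KB, a plain poll sees the socket writable
* while notsent_bytes < 128 KB, but a wakeup from sk_stream_write_space()
* (wake == 1) only signals EPOLLOUT once notsent_bytes has dropped below
* 64 KB, because the value is shifted left by @wake before the comparison
* below.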
3009 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3011 const struct tcp_sock *tp = tcp_sk(sk);
3012 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3013 READ_ONCE(tp->snd_nxt);
3015 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3017 EXPORT_SYMBOL(tcp_stream_memory_free);
3019 struct proto tcp_prot = {
3021 .owner = THIS_MODULE,
3023 .pre_connect = tcp_v4_pre_connect,
3024 .connect = tcp_v4_connect,
3025 .disconnect = tcp_disconnect,
3026 .accept = inet_csk_accept,
3028 .init = tcp_v4_init_sock,
3029 .destroy = tcp_v4_destroy_sock,
3030 .shutdown = tcp_shutdown,
3031 .setsockopt = tcp_setsockopt,
3032 .getsockopt = tcp_getsockopt,
3033 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3034 .keepalive = tcp_set_keepalive,
3035 .recvmsg = tcp_recvmsg,
3036 .sendmsg = tcp_sendmsg,
3037 .sendpage = tcp_sendpage,
3038 .backlog_rcv = tcp_v4_do_rcv,
3039 .release_cb = tcp_release_cb,
3041 .unhash = inet_unhash,
3042 .get_port = inet_csk_get_port,
3043 .put_port = inet_put_port,
3044 #ifdef CONFIG_BPF_SYSCALL
3045 .psock_update_sk_prot = tcp_bpf_update_proto,
3047 .enter_memory_pressure = tcp_enter_memory_pressure,
3048 .leave_memory_pressure = tcp_leave_memory_pressure,
3049 .stream_memory_free = tcp_stream_memory_free,
3050 .sockets_allocated = &tcp_sockets_allocated,
3051 .orphan_count = &tcp_orphan_count,
3052 .memory_allocated = &tcp_memory_allocated,
3053 .memory_pressure = &tcp_memory_pressure,
3054 .sysctl_mem = sysctl_tcp_mem,
3055 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3056 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3057 .max_header = MAX_TCP_HEADER,
3058 .obj_size = sizeof(struct tcp_sock),
3059 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3060 .twsk_prot = &tcp_timewait_sock_ops,
3061 .rsk_prot = &tcp_request_sock_ops,
3062 .h.hashinfo = &tcp_hashinfo,
3063 .no_autobind = true,
3064 .diag_destroy = tcp_abort,
3066 EXPORT_SYMBOL(tcp_prot);
3068 static void __net_exit tcp_sk_exit(struct net *net)
3070 struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
3072 if (net->ipv4.tcp_congestion_control)
3073 bpf_module_put(net->ipv4.tcp_congestion_control,
3074 net->ipv4.tcp_congestion_control->owner);
3075 if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
3076 kfree(tcp_death_row);
3079 static int __net_init tcp_sk_init(struct net *net)
3083 net->ipv4.sysctl_tcp_ecn = 2;
3084 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3086 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3087 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3088 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3089 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3090 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3092 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3093 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3094 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3096 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3097 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3098 net->ipv4.sysctl_tcp_syncookies = 1;
3099 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3100 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3101 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3102 net->ipv4.sysctl_tcp_orphan_retries = 0;
3103 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3104 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
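/* Descriptive note (not in the original source): a tcp_tw_reuse value of
 * 2 enables TIME-WAIT reuse for loopback traffic only; 1 enables it
 * globally and 0 disables it.
 */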
3105 net->ipv4.sysctl_tcp_tw_reuse = 2;
3106 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3108 net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
3109 if (!net->ipv4.tcp_death_row)
3111 refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
3112 cnt = tcp_hashinfo.ehash_mask + 1;
3113 net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
3114 net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
3116 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3117 net->ipv4.sysctl_tcp_sack = 1;
3118 net->ipv4.sysctl_tcp_window_scaling = 1;
3119 net->ipv4.sysctl_tcp_timestamps = 1;
3120 net->ipv4.sysctl_tcp_early_retrans = 3;
3121 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3122 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3123 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3124 net->ipv4.sysctl_tcp_max_reordering = 300;
3125 net->ipv4.sysctl_tcp_dsack = 1;
3126 net->ipv4.sysctl_tcp_app_win = 31;
3127 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3128 net->ipv4.sysctl_tcp_frto = 2;
3129 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3130 /* This limits the percentage of the congestion window which we
3131 * will allow a single TSO frame to consume. Building TSO frames
3132 * which are too large can cause TCP streams to be bursty.
3134 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3135 /* Default TSQ limit of 16 TSO segments */
3136 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3137 /* RFC 5961 challenge ACK rate limiting */
3138 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3139 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3140 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
3141 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3142 net->ipv4.sysctl_tcp_autocorking = 1;
3143 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
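/* Descriptive note (not in the original source): the pacing ratios below
 * are percentages of the computed rate, so 200 paces at twice the rate
 * during slow start and 120 at 1.2x the rate afterwards.
 */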
3144 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3145 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3146 if (net != &init_net) {
3147 memcpy(net->ipv4.sysctl_tcp_rmem,
3148 init_net.ipv4.sysctl_tcp_rmem,
3149 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3150 memcpy(net->ipv4.sysctl_tcp_wmem,
3151 init_net.ipv4.sysctl_tcp_wmem,
3152 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3154 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3155 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3156 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3157 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3158 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3159 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3161 /* Reno is always built in */
3162 if (!net_eq(net, &init_net) &&
3163 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3164 init_net.ipv4.tcp_congestion_control->owner))
3165 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3167 net->ipv4.tcp_congestion_control = &tcp_reno;
3172 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3176 list_for_each_entry(net, net_exit_list, exit_list)
3177 tcp_fastopen_ctx_destroy(net);
3180 static struct pernet_operations __net_initdata tcp_sk_ops = {
3181 .init = tcp_sk_init,
3182 .exit = tcp_sk_exit,
3183 .exit_batch = tcp_sk_exit_batch,
3186 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3187 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3188 struct sock_common *sk_common, uid_t uid)
3190 #define INIT_BATCH_SZ 16
3192 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3194 struct bpf_tcp_iter_state *iter = priv_data;
3197 err = bpf_iter_init_seq_net(priv_data, aux);
3201 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3203 bpf_iter_fini_seq_net(priv_data);
3210 static void bpf_iter_fini_tcp(void *priv_data)
3212 struct bpf_tcp_iter_state *iter = priv_data;
3214 bpf_iter_fini_seq_net(priv_data);
3215 kvfree(iter->batch);
3218 static const struct bpf_iter_seq_info tcp_seq_info = {
3219 .seq_ops = &bpf_iter_tcp_seq_ops,
3220 .init_seq_private = bpf_iter_init_tcp,
3221 .fini_seq_private = bpf_iter_fini_tcp,
3222 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3225 static const struct bpf_func_proto *
3226 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3227 const struct bpf_prog *prog)
3230 case BPF_FUNC_setsockopt:
3231 return &bpf_sk_setsockopt_proto;
3232 case BPF_FUNC_getsockopt:
3233 return &bpf_sk_getsockopt_proto;
3239 static struct bpf_iter_reg tcp_reg_info = {
3241 .ctx_arg_info_size = 1,
3243 { offsetof(struct bpf_iter__tcp, sk_common),
3244 PTR_TO_BTF_ID_OR_NULL },
3246 .get_func_proto = bpf_iter_tcp_get_func_proto,
3247 .seq_info = &tcp_seq_info,
3250 static void __init bpf_iter_register(void)
3252 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3253 if (bpf_iter_reg_target(&tcp_reg_info))
3254 pr_warn("Warning: could not register bpf iterator tcp\n");
3259 void __init tcp_v4_init(void)
3263 for_each_possible_cpu(cpu) {
3266 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3267 IPPROTO_TCP, &init_net);
3269 panic("Failed to create the TCP control socket.\n");
3270 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3272 /* Please enforce IP_DF and IPID==0 for RST and
3273 * ACK sent in SYN-RECV and TIME-WAIT state.
3275 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3277 per_cpu(ipv4_tcp_sk, cpu) = sk;
3279 if (register_pernet_subsys(&tcp_sk_ops))
3280 panic("Failed to create the TCP control socket.\n");
3282 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3283 bpf_iter_register();