net: inet: Retire port only listening_hash
net/ipv4/tcp_ipv4.c (platform/kernel/linux-starfive.git)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              Implementation of the Transmission Control Protocol(TCP).
8  *
9  *              IPv4 specific functions
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  */
18
19 /*
20  * Changes:
21  *              David S. Miller :       New socket lookup architecture.
22  *                                      This code is dedicated to John Dyson.
23  *              David S. Miller :       Change semantics of established hash,
24  *                                      half is devoted to TIME_WAIT sockets
25  *                                      and the rest go in the other half.
26  *              Andi Kleen :            Add support for syncookies and fixed
27  *                                      some bugs: ip options weren't passed to
28  *                                      the TCP layer, missed a check for an
29  *                                      ACK bit.
30  *              Andi Kleen :            Implemented fast path mtu discovery.
31  *                                      Fixed many serious bugs in the
32  *                                      request_sock handling and moved
33  *                                      most of it into the af independent code.
34  *                                      Added tail drop and some other bugfixes.
35  *                                      Added new listen semantics.
36  *              Mike McLagan    :       Routing by source
37  *      Juan Jose Ciarlante:            ip_dynaddr bits
38  *              Andi Kleen:             various fixes.
39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
40  *                                      coma.
41  *      Andi Kleen              :       Fix new listen.
42  *      Andi Kleen              :       Fix accept error reporting.
43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
45  *                                      a single port at the same time.
46  */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98         return secure_tcp_seq(ip_hdr(skb)->daddr,
99                               ip_hdr(skb)->saddr,
100                               tcp_hdr(skb)->dest,
101                               tcp_hdr(skb)->source);
102 }
103
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
108
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111         const struct inet_timewait_sock *tw = inet_twsk(sktw);
112         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
113         struct tcp_sock *tp = tcp_sk(sk);
114         int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
115
116         if (reuse == 2) {
117                 /* Still does not detect *everything* that goes through
118                  * lo, since we require a loopback src or dst address
119                  * or direct binding to 'lo' interface.
120                  */
121                 bool loopback = false;
122                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123                         loopback = true;
124 #if IS_ENABLED(CONFIG_IPV6)
125                 if (tw->tw_family == AF_INET6) {
126                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130                                 loopback = true;
131                 } else
132 #endif
133                 {
134                         if (ipv4_is_loopback(tw->tw_daddr) ||
135                             ipv4_is_loopback(tw->tw_rcv_saddr))
136                                 loopback = true;
137                 }
138                 if (!loopback)
139                         reuse = 0;
140         }
141
142         /* With PAWS, it is safe from the viewpoint
143            of data integrity. Even without PAWS it is safe provided sequence
144            spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
145 
146            Actually, the idea is close to VJ's, only the timestamp cache is
147            held not per host but per port pair, and the TW bucket is used as
148            the state holder.
149 
150            If the TW bucket has already been destroyed we fall back to VJ's
151            scheme and use the initial timestamp retrieved from the peer table.
152          */
153         if (tcptw->tw_ts_recent_stamp &&
154             (!twp || (reuse && time_after32(ktime_get_seconds(),
155                                             tcptw->tw_ts_recent_stamp)))) {
156                 /* In case of repair and re-using TIME-WAIT sockets we still
157                  * want to be sure that it is safe as above but honor the
158                  * sequence numbers and time stamps set as part of the repair
159                  * process.
160                  *
161                  * Without this check re-using a TIME-WAIT socket with TCP
162                  * repair would accumulate a -1 on the repair assigned
163                  * sequence number. The first time it is reused the sequence
164                  * is -1, the second time -2, etc. This fixes that issue
165                  * without appearing to create any others.
166                  */
167                 if (likely(!tp->repair)) {
168                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169
170                         if (!seq)
171                                 seq = 1;
172                         WRITE_ONCE(tp->write_seq, seq);
173                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
174                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
175                 }
176                 sock_hold(sktw);
177                 return 1;
178         }
179
180         return 0;
181 }
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
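For context, and not part of this file: the reuse == 2 branch above implements the loopback-only mode of the net.ipv4.tcp_tw_reuse sysctl that sock_net(sk)->ipv4.sysctl_tcp_tw_reuse is read from. A minimal userspace sketch of selecting that mode, assuming the usual procfs path and with error handling kept short:

#include <fcntl.h>
#include <unistd.h>

/* Ask the kernel to reuse TIME-WAIT sockets for loopback connections only. */
static int enable_loopback_tw_reuse(void)
{
        int fd = open("/proc/sys/net/ipv4/tcp_tw_reuse", O_WRONLY);

        if (fd < 0)
                return -1;
        if (write(fd, "2", 1) != 1) {
                close(fd);
                return -1;
        }
        return close(fd);
}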
183
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185                               int addr_len)
186 {
187         /* This check is replicated from tcp_v4_connect() and intended to
188          * prevent the BPF program called below from accessing bytes that are
189          * outside the bound specified by the user in addr_len.
190          */
191         if (addr_len < sizeof(struct sockaddr_in))
192                 return -EINVAL;
193
194         sock_owned_by_me(sk);
195
196         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 }
198
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201 {
202         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203         struct inet_sock *inet = inet_sk(sk);
204         struct tcp_sock *tp = tcp_sk(sk);
205         __be16 orig_sport, orig_dport;
206         __be32 daddr, nexthop;
207         struct flowi4 *fl4;
208         struct rtable *rt;
209         int err;
210         struct ip_options_rcu *inet_opt;
211         struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
212
213         if (addr_len < sizeof(struct sockaddr_in))
214                 return -EINVAL;
215
216         if (usin->sin_family != AF_INET)
217                 return -EAFNOSUPPORT;
218
219         nexthop = daddr = usin->sin_addr.s_addr;
220         inet_opt = rcu_dereference_protected(inet->inet_opt,
221                                              lockdep_sock_is_held(sk));
222         if (inet_opt && inet_opt->opt.srr) {
223                 if (!daddr)
224                         return -EINVAL;
225                 nexthop = inet_opt->opt.faddr;
226         }
227
228         orig_sport = inet->inet_sport;
229         orig_dport = usin->sin_port;
230         fl4 = &inet->cork.fl.u.ip4;
231         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
232                               sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
233                               orig_dport, sk);
234         if (IS_ERR(rt)) {
235                 err = PTR_ERR(rt);
236                 if (err == -ENETUNREACH)
237                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
238                 return err;
239         }
240
241         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
242                 ip_rt_put(rt);
243                 return -ENETUNREACH;
244         }
245
246         if (!inet_opt || !inet_opt->opt.srr)
247                 daddr = fl4->daddr;
248
249         if (!inet->inet_saddr)
250                 inet->inet_saddr = fl4->saddr;
251         sk_rcv_saddr_set(sk, inet->inet_saddr);
252
253         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
254                 /* Reset inherited state */
255                 tp->rx_opt.ts_recent       = 0;
256                 tp->rx_opt.ts_recent_stamp = 0;
257                 if (likely(!tp->repair))
258                         WRITE_ONCE(tp->write_seq, 0);
259         }
260
261         inet->inet_dport = usin->sin_port;
262         sk_daddr_set(sk, daddr);
263
264         inet_csk(sk)->icsk_ext_hdr_len = 0;
265         if (inet_opt)
266                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
267
268         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
269
270         /* Socket identity is still unknown (sport may be zero).
271          * However we set state to SYN-SENT and, without releasing the socket
272          * lock, select a source port, enter ourselves into the hash tables and
273          * complete initialization after this.
274          */
275         tcp_set_state(sk, TCP_SYN_SENT);
276         err = inet_hash_connect(tcp_death_row, sk);
277         if (err)
278                 goto failure;
279
280         sk_set_txhash(sk);
281
282         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
283                                inet->inet_sport, inet->inet_dport, sk);
284         if (IS_ERR(rt)) {
285                 err = PTR_ERR(rt);
286                 rt = NULL;
287                 goto failure;
288         }
289         /* OK, now commit destination to socket.  */
290         sk->sk_gso_type = SKB_GSO_TCPV4;
291         sk_setup_caps(sk, &rt->dst);
292         rt = NULL;
293
294         if (likely(!tp->repair)) {
295                 if (!tp->write_seq)
296                         WRITE_ONCE(tp->write_seq,
297                                    secure_tcp_seq(inet->inet_saddr,
298                                                   inet->inet_daddr,
299                                                   inet->inet_sport,
300                                                   usin->sin_port));
301                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
302                                                  inet->inet_saddr,
303                                                  inet->inet_daddr);
304         }
305
306         inet->inet_id = prandom_u32();
307
308         if (tcp_fastopen_defer_connect(sk, &err))
309                 return err;
310         if (err)
311                 goto failure;
312
313         err = tcp_connect(sk);
314
315         if (err)
316                 goto failure;
317
318         return 0;
319
320 failure:
321         /*
322          * This unhashes the socket and releases the local port,
323          * if necessary.
324          */
325         tcp_set_state(sk, TCP_CLOSE);
326         ip_rt_put(rt);
327         sk->sk_route_caps = 0;
328         inet->inet_dport = 0;
329         return err;
330 }
331 EXPORT_SYMBOL(tcp_v4_connect);
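For orientation, and not part of this file: tcp_v4_connect() above is reached from an ordinary connect(2) on an AF_INET stream socket. A minimal userspace sketch of that call path, with a hypothetical helper name and error handling trimmed:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

/* Open a TCP connection to ip:port; returns a connected fd or -1. */
static int connect_v4(const char *ip, unsigned short port)
{
        struct sockaddr_in peer = { .sin_family = AF_INET };
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;
        peer.sin_port = htons(port);
        if (inet_pton(AF_INET, ip, &peer.sin_addr) != 1 ||
            connect(fd, (struct sockaddr *)&peer, sizeof(peer)) < 0) {
                close(fd);
                return -1;
        }
        return fd;
}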
332
333 /*
334  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
335  * It can be called through tcp_release_cb() if socket was owned by user
336  * at the time tcp_v4_err() was called to handle ICMP message.
337  */
338 void tcp_v4_mtu_reduced(struct sock *sk)
339 {
340         struct inet_sock *inet = inet_sk(sk);
341         struct dst_entry *dst;
342         u32 mtu;
343
344         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
345                 return;
346         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
347         dst = inet_csk_update_pmtu(sk, mtu);
348         if (!dst)
349                 return;
350
351         /* Something is about to go wrong... Remember the soft error
352          * in case this connection is not able to recover.
353          */
354         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
355                 sk->sk_err_soft = EMSGSIZE;
356
357         mtu = dst_mtu(dst);
358
359         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
360             ip_sk_accept_pmtu(sk) &&
361             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
362                 tcp_sync_mss(sk, mtu);
363
364                 /* Resend the TCP packet because it's
365                  * clear that the old packet has been
366                  * dropped. This is the new "fast" path mtu
367                  * discovery.
368                  */
369                 tcp_simple_retransmit(sk);
370         } /* else let the usual retransmit timer handle it */
371 }
372 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
373
374 static void do_redirect(struct sk_buff *skb, struct sock *sk)
375 {
376         struct dst_entry *dst = __sk_dst_check(sk, 0);
377
378         if (dst)
379                 dst->ops->redirect(dst, sk, skb);
380 }
381
382
383 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
384 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
385 {
386         struct request_sock *req = inet_reqsk(sk);
387         struct net *net = sock_net(sk);
388
389         /* ICMPs are not backlogged, hence we cannot get
390          * an established socket here.
391          */
392         if (seq != tcp_rsk(req)->snt_isn) {
393                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
394         } else if (abort) {
395                 /*
396                  * Still in SYN_RECV, just remove it silently.
397                  * There is no good way to pass the error to the newly
398                  * created socket, and POSIX does not want network
399                  * errors returned from accept().
400                  */
401                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
402                 tcp_listendrop(req->rsk_listener);
403         }
404         reqsk_put(req);
405 }
406 EXPORT_SYMBOL(tcp_req_err);
407
408 /* TCP-LD (RFC 6069) logic */
409 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
410 {
411         struct inet_connection_sock *icsk = inet_csk(sk);
412         struct tcp_sock *tp = tcp_sk(sk);
413         struct sk_buff *skb;
414         s32 remaining;
415         u32 delta_us;
416
417         if (sock_owned_by_user(sk))
418                 return;
419
420         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
421             !icsk->icsk_backoff)
422                 return;
423
424         skb = tcp_rtx_queue_head(sk);
425         if (WARN_ON_ONCE(!skb))
426                 return;
427
428         icsk->icsk_backoff--;
429         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
430         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
431
432         tcp_mstamp_refresh(tp);
433         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
434         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
435
436         if (remaining > 0) {
437                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
438                                           remaining, TCP_RTO_MAX);
439         } else {
440                 /* RTO revert clocked out retransmission.
441                  * Will retransmit now.
442                  */
443                 tcp_retransmit_timer(sk);
444         }
445 }
446 EXPORT_SYMBOL(tcp_ld_RTO_revert);
447
448 /*
449  * This routine is called by the ICMP module when it gets some
450  * sort of error condition.  If err < 0 then the socket should
451  * be closed and the error returned to the user.  If err > 0
452  * it's just the icmp type << 8 | icmp code.  After adjustment
453  * header points to the first 8 bytes of the tcp header.  We need
454  * to find the appropriate port.
455  *
456  * The locking strategy used here is very "optimistic". When
457  * someone else accesses the socket the ICMP is just dropped
458  * and for some paths there is no check at all.
459  * A more general error queue to queue errors for later handling
460  * is probably better.
461  *
462  */
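As a concrete reading of the err > 0 convention described above (informational only): an ICMP Destination Unreachable / Port Unreachable message, type 3 and code 3, would be encoded as (3 << 8) | 3 == 0x0303.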
463
464 int tcp_v4_err(struct sk_buff *skb, u32 info)
465 {
466         const struct iphdr *iph = (const struct iphdr *)skb->data;
467         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
468         struct tcp_sock *tp;
469         struct inet_sock *inet;
470         const int type = icmp_hdr(skb)->type;
471         const int code = icmp_hdr(skb)->code;
472         struct sock *sk;
473         struct request_sock *fastopen;
474         u32 seq, snd_una;
475         int err;
476         struct net *net = dev_net(skb->dev);
477
478         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
479                                        th->dest, iph->saddr, ntohs(th->source),
480                                        inet_iif(skb), 0);
481         if (!sk) {
482                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
483                 return -ENOENT;
484         }
485         if (sk->sk_state == TCP_TIME_WAIT) {
486                 inet_twsk_put(inet_twsk(sk));
487                 return 0;
488         }
489         seq = ntohl(th->seq);
490         if (sk->sk_state == TCP_NEW_SYN_RECV) {
491                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
492                                      type == ICMP_TIME_EXCEEDED ||
493                                      (type == ICMP_DEST_UNREACH &&
494                                       (code == ICMP_NET_UNREACH ||
495                                        code == ICMP_HOST_UNREACH)));
496                 return 0;
497         }
498
499         bh_lock_sock(sk);
500         /* If too many ICMPs get dropped on busy
501          * servers this needs to be solved differently.
502          * We do take care of the PMTU discovery (RFC1191) special case:
503          * we can receive locally generated ICMP messages while socket is held.
504          */
505         if (sock_owned_by_user(sk)) {
506                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
507                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
508         }
509         if (sk->sk_state == TCP_CLOSE)
510                 goto out;
511
512         if (static_branch_unlikely(&ip4_min_ttl)) {
513                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
514                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
515                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
516                         goto out;
517                 }
518         }
519
520         tp = tcp_sk(sk);
521         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
522         fastopen = rcu_dereference(tp->fastopen_rsk);
523         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
524         if (sk->sk_state != TCP_LISTEN &&
525             !between(seq, snd_una, tp->snd_nxt)) {
526                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
527                 goto out;
528         }
529
530         switch (type) {
531         case ICMP_REDIRECT:
532                 if (!sock_owned_by_user(sk))
533                         do_redirect(skb, sk);
534                 goto out;
535         case ICMP_SOURCE_QUENCH:
536                 /* Just silently ignore these. */
537                 goto out;
538         case ICMP_PARAMETERPROB:
539                 err = EPROTO;
540                 break;
541         case ICMP_DEST_UNREACH:
542                 if (code > NR_ICMP_UNREACH)
543                         goto out;
544
545                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
546                         /* We are not interested in TCP_LISTEN and open_requests
547                          * (SYN-ACKs sent out by Linux are always < 576 bytes so
548                          * they should go through unfragmented).
549                          */
550                         if (sk->sk_state == TCP_LISTEN)
551                                 goto out;
552
553                         WRITE_ONCE(tp->mtu_info, info);
554                         if (!sock_owned_by_user(sk)) {
555                                 tcp_v4_mtu_reduced(sk);
556                         } else {
557                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
558                                         sock_hold(sk);
559                         }
560                         goto out;
561                 }
562
563                 err = icmp_err_convert[code].errno;
564                 /* check if this ICMP message allows revert of backoff.
565                  * (see RFC 6069)
566                  */
567                 if (!fastopen &&
568                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
569                         tcp_ld_RTO_revert(sk, seq);
570                 break;
571         case ICMP_TIME_EXCEEDED:
572                 err = EHOSTUNREACH;
573                 break;
574         default:
575                 goto out;
576         }
577
578         switch (sk->sk_state) {
579         case TCP_SYN_SENT:
580         case TCP_SYN_RECV:
581                 /* Only in fast or simultaneous open. If a fast open socket is
582                  * already accepted it is treated as a connected one below.
583                  */
584                 if (fastopen && !fastopen->sk)
585                         break;
586
587                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
588
589                 if (!sock_owned_by_user(sk)) {
590                         sk->sk_err = err;
591
592                         sk_error_report(sk);
593
594                         tcp_done(sk);
595                 } else {
596                         sk->sk_err_soft = err;
597                 }
598                 goto out;
599         }
600
601         /* If we've already connected we will keep trying
602          * until we time out, or the user gives up.
603          *
604          * RFC 1122 4.2.3.9 allows us to consider as hard errors
605          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
606          * but it is obsoleted by pmtu discovery).
607          *
608          * Note that in the modern internet, where routing is unreliable
609          * and broken firewalls sit in every dark corner sending random
610          * errors ordered by their masters, even these two messages finally
611          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
612          *
613          * Now we are in compliance with RFCs.
614          *                                                      --ANK (980905)
615          */
616
617         inet = inet_sk(sk);
618         if (!sock_owned_by_user(sk) && inet->recverr) {
619                 sk->sk_err = err;
620                 sk_error_report(sk);
621         } else  { /* Only an error on timeout */
622                 sk->sk_err_soft = err;
623         }
624
625 out:
626         bh_unlock_sock(sk);
627         sock_put(sk);
628         return 0;
629 }
630
631 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
632 {
633         struct tcphdr *th = tcp_hdr(skb);
634
635         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
636         skb->csum_start = skb_transport_header(skb) - skb->head;
637         skb->csum_offset = offsetof(struct tcphdr, check);
638 }
639
640 /* This routine computes an IPv4 TCP checksum. */
641 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
642 {
643         const struct inet_sock *inet = inet_sk(sk);
644
645         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
646 }
647 EXPORT_SYMBOL(tcp_v4_send_check);
648
649 /*
650  *      This routine will send an RST to the other tcp.
651  *
652  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
653  *                    for the reset?
654  *      Answer: if a packet caused an RST, it is not for a socket
655  *              existing in our system; if it is matched to a socket,
656  *              it is just a duplicate segment or a bug in the other side's TCP.
657  *              So we build the reply based only on the parameters
658  *              that arrived with the segment.
659  *      Exception: precedence violation. We do not implement it in any case.
660  */
661
662 #ifdef CONFIG_TCP_MD5SIG
663 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
664 #else
665 #define OPTION_BYTES sizeof(__be32)
666 #endif
667
668 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
669 {
670         const struct tcphdr *th = tcp_hdr(skb);
671         struct {
672                 struct tcphdr th;
673                 __be32 opt[OPTION_BYTES / sizeof(__be32)];
674         } rep;
675         struct ip_reply_arg arg;
676 #ifdef CONFIG_TCP_MD5SIG
677         struct tcp_md5sig_key *key = NULL;
678         const __u8 *hash_location = NULL;
679         unsigned char newhash[16];
680         int genhash;
681         struct sock *sk1 = NULL;
682 #endif
683         u64 transmit_time = 0;
684         struct sock *ctl_sk;
685         struct net *net;
686
687         /* Never send a reset in response to a reset. */
688         if (th->rst)
689                 return;
690
691         /* If sk is not NULL, it means we did a successful lookup and the incoming
692          * route had to be correct. prequeue might have dropped our dst.
693          */
694         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
695                 return;
696
697         /* Swap the send and the receive. */
698         memset(&rep, 0, sizeof(rep));
699         rep.th.dest   = th->source;
700         rep.th.source = th->dest;
701         rep.th.doff   = sizeof(struct tcphdr) / 4;
702         rep.th.rst    = 1;
703
704         if (th->ack) {
705                 rep.th.seq = th->ack_seq;
706         } else {
707                 rep.th.ack = 1;
708                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
709                                        skb->len - (th->doff << 2));
710         }
711
712         memset(&arg, 0, sizeof(arg));
713         arg.iov[0].iov_base = (unsigned char *)&rep;
714         arg.iov[0].iov_len  = sizeof(rep.th);
715
716         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
717 #ifdef CONFIG_TCP_MD5SIG
718         rcu_read_lock();
719         hash_location = tcp_parse_md5sig_option(th);
720         if (sk && sk_fullsock(sk)) {
721                 const union tcp_md5_addr *addr;
722                 int l3index;
723
724                 /* If sdif is set, the packet ingressed via a device
725                  * in an L3 domain and inet_iif is set to it.
726                  */
727                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
728                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
729                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
730         } else if (hash_location) {
731                 const union tcp_md5_addr *addr;
732                 int sdif = tcp_v4_sdif(skb);
733                 int dif = inet_iif(skb);
734                 int l3index;
735
736                 /*
737                  * The active side is lost. Try to find the listening socket through
738                  * the source port, and then find the md5 key through the listening socket.
739                  * We do not loosen security here:
740                  * the incoming packet is checked against the md5 hash of the found key,
741                  * and no RST is generated if the md5 hash doesn't match.
742                  */
743                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
744                                              ip_hdr(skb)->saddr,
745                                              th->source, ip_hdr(skb)->daddr,
746                                              ntohs(th->source), dif, sdif);
747                 /* don't send rst if it can't find key */
748                 if (!sk1)
749                         goto out;
750
751                 /* If sdif is set, the packet ingressed via a device
752                  * in an L3 domain and dif is set to it.
753                  */
754                 l3index = sdif ? dif : 0;
755                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
756                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
757                 if (!key)
758                         goto out;
759
760
761                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
762                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
763                         goto out;
764
765         }
766
767         if (key) {
768                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
769                                    (TCPOPT_NOP << 16) |
770                                    (TCPOPT_MD5SIG << 8) |
771                                    TCPOLEN_MD5SIG);
772                 /* Update length and the length the header thinks exists */
773                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
774                 rep.th.doff = arg.iov[0].iov_len / 4;
775
776                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
777                                      key, ip_hdr(skb)->saddr,
778                                      ip_hdr(skb)->daddr, &rep.th);
779         }
780 #endif
781         /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
782         if (rep.opt[0] == 0) {
783                 __be32 mrst = mptcp_reset_option(skb);
784
785                 if (mrst) {
786                         rep.opt[0] = mrst;
787                         arg.iov[0].iov_len += sizeof(mrst);
788                         rep.th.doff = arg.iov[0].iov_len / 4;
789                 }
790         }
791
792         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
793                                       ip_hdr(skb)->saddr, /* XXX */
794                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
795         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
796         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
797
798         /* When the socket is gone, all binding information is lost.
799          * Routing might fail in this case. No choice here: if we choose to force
800          * the input interface, we will misroute in case of an asymmetric route.
801          */
802         if (sk) {
803                 arg.bound_dev_if = sk->sk_bound_dev_if;
804                 if (sk_fullsock(sk))
805                         trace_tcp_send_reset(sk, skb);
806         }
807
808         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
809                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
810
811         arg.tos = ip_hdr(skb)->tos;
812         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
813         local_bh_disable();
814         ctl_sk = this_cpu_read(ipv4_tcp_sk);
815         sock_net_set(ctl_sk, net);
816         if (sk) {
817                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
818                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
819                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
820                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
821                 transmit_time = tcp_transmit_time(sk);
822         }
823         ip_send_unicast_reply(ctl_sk,
824                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
825                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
826                               &arg, arg.iov[0].iov_len,
827                               transmit_time);
828
829         ctl_sk->sk_mark = 0;
830         sock_net_set(ctl_sk, &init_net);
831         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
832         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
833         local_bh_enable();
834
835 #ifdef CONFIG_TCP_MD5SIG
836 out:
837         rcu_read_unlock();
838 #endif
839 }
840
841 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
842    outside of socket context, is ugly, certainly. What can I do?
843  */
844
845 static void tcp_v4_send_ack(const struct sock *sk,
846                             struct sk_buff *skb, u32 seq, u32 ack,
847                             u32 win, u32 tsval, u32 tsecr, int oif,
848                             struct tcp_md5sig_key *key,
849                             int reply_flags, u8 tos)
850 {
851         const struct tcphdr *th = tcp_hdr(skb);
852         struct {
853                 struct tcphdr th;
854                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
855 #ifdef CONFIG_TCP_MD5SIG
856                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
857 #endif
858                         ];
859         } rep;
860         struct net *net = sock_net(sk);
861         struct ip_reply_arg arg;
862         struct sock *ctl_sk;
863         u64 transmit_time;
864
865         memset(&rep.th, 0, sizeof(struct tcphdr));
866         memset(&arg, 0, sizeof(arg));
867
868         arg.iov[0].iov_base = (unsigned char *)&rep;
869         arg.iov[0].iov_len  = sizeof(rep.th);
870         if (tsecr) {
871                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
872                                    (TCPOPT_TIMESTAMP << 8) |
873                                    TCPOLEN_TIMESTAMP);
874                 rep.opt[1] = htonl(tsval);
875                 rep.opt[2] = htonl(tsecr);
876                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
877         }
878
879         /* Swap the send and the receive. */
880         rep.th.dest    = th->source;
881         rep.th.source  = th->dest;
882         rep.th.doff    = arg.iov[0].iov_len / 4;
883         rep.th.seq     = htonl(seq);
884         rep.th.ack_seq = htonl(ack);
885         rep.th.ack     = 1;
886         rep.th.window  = htons(win);
887
888 #ifdef CONFIG_TCP_MD5SIG
889         if (key) {
890                 int offset = (tsecr) ? 3 : 0;
891
892                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
893                                           (TCPOPT_NOP << 16) |
894                                           (TCPOPT_MD5SIG << 8) |
895                                           TCPOLEN_MD5SIG);
896                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
897                 rep.th.doff = arg.iov[0].iov_len/4;
898
899                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
900                                     key, ip_hdr(skb)->saddr,
901                                     ip_hdr(skb)->daddr, &rep.th);
902         }
903 #endif
904         arg.flags = reply_flags;
905         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
906                                       ip_hdr(skb)->saddr, /* XXX */
907                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
908         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
909         if (oif)
910                 arg.bound_dev_if = oif;
911         arg.tos = tos;
912         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
913         local_bh_disable();
914         ctl_sk = this_cpu_read(ipv4_tcp_sk);
915         sock_net_set(ctl_sk, net);
916         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
917                            inet_twsk(sk)->tw_mark : sk->sk_mark;
918         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
919                            inet_twsk(sk)->tw_priority : sk->sk_priority;
920         transmit_time = tcp_transmit_time(sk);
921         ip_send_unicast_reply(ctl_sk,
922                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
923                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
924                               &arg, arg.iov[0].iov_len,
925                               transmit_time);
926
927         ctl_sk->sk_mark = 0;
928         sock_net_set(ctl_sk, &init_net);
929         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
930         local_bh_enable();
931 }
932
933 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
934 {
935         struct inet_timewait_sock *tw = inet_twsk(sk);
936         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
937
938         tcp_v4_send_ack(sk, skb,
939                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
940                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
941                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
942                         tcptw->tw_ts_recent,
943                         tw->tw_bound_dev_if,
944                         tcp_twsk_md5_key(tcptw),
945                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
946                         tw->tw_tos
947                         );
948
949         inet_twsk_put(tw);
950 }
951
952 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
953                                   struct request_sock *req)
954 {
955         const union tcp_md5_addr *addr;
956         int l3index;
957
958         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
959          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
960          */
961         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
962                                              tcp_sk(sk)->snd_nxt;
963
964         /* RFC 7323 2.3
965          * The window field (SEG.WND) of every outgoing segment, with the
966          * exception of <SYN> segments, MUST be right-shifted by
967          * Rcv.Wind.Shift bits:
968          */
969         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
970         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
971         tcp_v4_send_ack(sk, skb, seq,
972                         tcp_rsk(req)->rcv_nxt,
973                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
974                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
975                         req->ts_recent,
976                         0,
977                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
978                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
979                         ip_hdr(skb)->tos);
980 }
981
982 /*
983  *      Send a SYN-ACK after having received a SYN.
984  *      This still operates on a request_sock only, not on a big
985  *      socket.
986  */
987 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
988                               struct flowi *fl,
989                               struct request_sock *req,
990                               struct tcp_fastopen_cookie *foc,
991                               enum tcp_synack_type synack_type,
992                               struct sk_buff *syn_skb)
993 {
994         const struct inet_request_sock *ireq = inet_rsk(req);
995         struct flowi4 fl4;
996         int err = -1;
997         struct sk_buff *skb;
998         u8 tos;
999
1000         /* First, grab a route. */
1001         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1002                 return -1;
1003
1004         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1005
1006         if (skb) {
1007                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1008
1009                 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1010                                 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1011                                 (inet_sk(sk)->tos & INET_ECN_MASK) :
1012                                 inet_sk(sk)->tos;
1013
1014                 if (!INET_ECN_is_capable(tos) &&
1015                     tcp_bpf_ca_needs_ecn((struct sock *)req))
1016                         tos |= INET_ECN_ECT_0;
1017
1018                 rcu_read_lock();
1019                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1020                                             ireq->ir_rmt_addr,
1021                                             rcu_dereference(ireq->ireq_opt),
1022                                             tos);
1023                 rcu_read_unlock();
1024                 err = net_xmit_eval(err);
1025         }
1026
1027         return err;
1028 }
1029
1030 /*
1031  *      IPv4 request_sock destructor.
1032  */
1033 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1034 {
1035         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1036 }
1037
1038 #ifdef CONFIG_TCP_MD5SIG
1039 /*
1040  * RFC2385 MD5 checksumming requires a mapping of
1041  * IP address->MD5 Key.
1042  * We need to maintain these in the sk structure.
1043  */
1044
1045 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1046 EXPORT_SYMBOL(tcp_md5_needed);
1047
1048 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1049 {
1050         if (!old)
1051                 return true;
1052
1053         /* l3index always overrides non-l3index */
1054         if (old->l3index && new->l3index == 0)
1055                 return false;
1056         if (old->l3index == 0 && new->l3index)
1057                 return true;
1058
1059         return old->prefixlen < new->prefixlen;
1060 }
1061
1062 /* Find the Key structure for an address.  */
1063 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1064                                            const union tcp_md5_addr *addr,
1065                                            int family)
1066 {
1067         const struct tcp_sock *tp = tcp_sk(sk);
1068         struct tcp_md5sig_key *key;
1069         const struct tcp_md5sig_info *md5sig;
1070         __be32 mask;
1071         struct tcp_md5sig_key *best_match = NULL;
1072         bool match;
1073
1074         /* caller either holds rcu_read_lock() or socket lock */
1075         md5sig = rcu_dereference_check(tp->md5sig_info,
1076                                        lockdep_sock_is_held(sk));
1077         if (!md5sig)
1078                 return NULL;
1079
1080         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1081                                  lockdep_sock_is_held(sk)) {
1082                 if (key->family != family)
1083                         continue;
1084                 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1085                         continue;
1086                 if (family == AF_INET) {
1087                         mask = inet_make_mask(key->prefixlen);
1088                         match = (key->addr.a4.s_addr & mask) ==
1089                                 (addr->a4.s_addr & mask);
1090 #if IS_ENABLED(CONFIG_IPV6)
1091                 } else if (family == AF_INET6) {
1092                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1093                                                   key->prefixlen);
1094 #endif
1095                 } else {
1096                         match = false;
1097                 }
1098
1099                 if (match && better_md5_match(best_match, key))
1100                         best_match = key;
1101         }
1102         return best_match;
1103 }
1104 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1105
1106 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1107                                                       const union tcp_md5_addr *addr,
1108                                                       int family, u8 prefixlen,
1109                                                       int l3index, u8 flags)
1110 {
1111         const struct tcp_sock *tp = tcp_sk(sk);
1112         struct tcp_md5sig_key *key;
1113         unsigned int size = sizeof(struct in_addr);
1114         const struct tcp_md5sig_info *md5sig;
1115
1116         /* caller either holds rcu_read_lock() or socket lock */
1117         md5sig = rcu_dereference_check(tp->md5sig_info,
1118                                        lockdep_sock_is_held(sk));
1119         if (!md5sig)
1120                 return NULL;
1121 #if IS_ENABLED(CONFIG_IPV6)
1122         if (family == AF_INET6)
1123                 size = sizeof(struct in6_addr);
1124 #endif
1125         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1126                                  lockdep_sock_is_held(sk)) {
1127                 if (key->family != family)
1128                         continue;
1129                 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1130                         continue;
1131                 if (key->l3index != l3index)
1132                         continue;
1133                 if (!memcmp(&key->addr, addr, size) &&
1134                     key->prefixlen == prefixlen)
1135                         return key;
1136         }
1137         return NULL;
1138 }
1139
1140 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1141                                          const struct sock *addr_sk)
1142 {
1143         const union tcp_md5_addr *addr;
1144         int l3index;
1145
1146         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1147                                                  addr_sk->sk_bound_dev_if);
1148         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1149         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1150 }
1151 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1152
1153 /* This can be called on a newly created socket, from other files */
1154 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1155                    int family, u8 prefixlen, int l3index, u8 flags,
1156                    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1157 {
1158         /* Add Key to the list */
1159         struct tcp_md5sig_key *key;
1160         struct tcp_sock *tp = tcp_sk(sk);
1161         struct tcp_md5sig_info *md5sig;
1162
1163         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1164         if (key) {
1165                 /* Pre-existing entry - just update that one.
1166                  * Note that the key might be used concurrently.
1167                  * data_race() is telling kcsan that we do not care about
1168                  * key mismatches, since changing the MD5 key on live flows
1169                  * can lead to packet drops.
1170                  */
1171                 data_race(memcpy(key->key, newkey, newkeylen));
1172
1173                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1174                  * Also note that a reader could catch new key->keylen value
1175                  * but old key->key[], this is the reason we use __GFP_ZERO
1176                  * at sock_kmalloc() time below these lines.
1177                  */
1178                 WRITE_ONCE(key->keylen, newkeylen);
1179
1180                 return 0;
1181         }
1182
1183         md5sig = rcu_dereference_protected(tp->md5sig_info,
1184                                            lockdep_sock_is_held(sk));
1185         if (!md5sig) {
1186                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1187                 if (!md5sig)
1188                         return -ENOMEM;
1189
1190                 sk_gso_disable(sk);
1191                 INIT_HLIST_HEAD(&md5sig->head);
1192                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1193         }
1194
1195         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1196         if (!key)
1197                 return -ENOMEM;
1198         if (!tcp_alloc_md5sig_pool()) {
1199                 sock_kfree_s(sk, key, sizeof(*key));
1200                 return -ENOMEM;
1201         }
1202
1203         memcpy(key->key, newkey, newkeylen);
1204         key->keylen = newkeylen;
1205         key->family = family;
1206         key->prefixlen = prefixlen;
1207         key->l3index = l3index;
1208         key->flags = flags;
1209         memcpy(&key->addr, addr,
1210                (family == AF_INET6) ? sizeof(struct in6_addr) :
1211                                       sizeof(struct in_addr));
1212         hlist_add_head_rcu(&key->node, &md5sig->head);
1213         return 0;
1214 }
1215 EXPORT_SYMBOL(tcp_md5_do_add);
1216
1217 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1218                    u8 prefixlen, int l3index, u8 flags)
1219 {
1220         struct tcp_md5sig_key *key;
1221
1222         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1223         if (!key)
1224                 return -ENOENT;
1225         hlist_del_rcu(&key->node);
1226         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1227         kfree_rcu(key, rcu);
1228         return 0;
1229 }
1230 EXPORT_SYMBOL(tcp_md5_do_del);
1231
1232 static void tcp_clear_md5_list(struct sock *sk)
1233 {
1234         struct tcp_sock *tp = tcp_sk(sk);
1235         struct tcp_md5sig_key *key;
1236         struct hlist_node *n;
1237         struct tcp_md5sig_info *md5sig;
1238
1239         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1240
1241         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1242                 hlist_del_rcu(&key->node);
1243                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1244                 kfree_rcu(key, rcu);
1245         }
1246 }
1247
1248 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1249                                  sockptr_t optval, int optlen)
1250 {
1251         struct tcp_md5sig cmd;
1252         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1253         const union tcp_md5_addr *addr;
1254         u8 prefixlen = 32;
1255         int l3index = 0;
1256         u8 flags;
1257
1258         if (optlen < sizeof(cmd))
1259                 return -EINVAL;
1260
1261         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1262                 return -EFAULT;
1263
1264         if (sin->sin_family != AF_INET)
1265                 return -EINVAL;
1266
1267         flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1268
1269         if (optname == TCP_MD5SIG_EXT &&
1270             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1271                 prefixlen = cmd.tcpm_prefixlen;
1272                 if (prefixlen > 32)
1273                         return -EINVAL;
1274         }
1275
1276         if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1277             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1278                 struct net_device *dev;
1279
1280                 rcu_read_lock();
1281                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1282                 if (dev && netif_is_l3_master(dev))
1283                         l3index = dev->ifindex;
1284
1285                 rcu_read_unlock();
1286
1287                 /* ok to reference set/not set outside of rcu;
1288                  * right now device MUST be an L3 master
1289                  */
1290                 if (!dev || !l3index)
1291                         return -EINVAL;
1292         }
1293
1294         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1295
1296         if (!cmd.tcpm_keylen)
1297                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1298
1299         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1300                 return -EINVAL;
1301
1302         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1303                               cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1304 }
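For reference, and not part of this file: the option parsed above is set from userspace with setsockopt(TCP_MD5SIG) and struct tcp_md5sig from the UAPI headers (TCP_MD5SIG_EXT adds the prefix/ifindex fields handled above). A minimal sketch; the helper name and key are illustrative:

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>          /* struct tcp_md5sig, TCP_MD5SIG */

/* Attach an MD5 key for an IPv4 peer before connect()/listen().
 * keylen must not exceed TCP_MD5SIG_MAXKEYLEN (80 bytes).
 */
static int set_tcp_md5_key(int fd, const struct sockaddr_in *peer,
                           const void *key, unsigned int keylen)
{
        struct tcp_md5sig md5;

        memset(&md5, 0, sizeof(md5));
        memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
        md5.tcpm_keylen = keylen;
        memcpy(md5.tcpm_key, key, keylen);
        return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}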
1305
1306 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1307                                    __be32 daddr, __be32 saddr,
1308                                    const struct tcphdr *th, int nbytes)
1309 {
1310         struct tcp4_pseudohdr *bp;
1311         struct scatterlist sg;
1312         struct tcphdr *_th;
1313
1314         bp = hp->scratch;
1315         bp->saddr = saddr;
1316         bp->daddr = daddr;
1317         bp->pad = 0;
1318         bp->protocol = IPPROTO_TCP;
1319         bp->len = cpu_to_be16(nbytes);
1320
1321         _th = (struct tcphdr *)(bp + 1);
1322         memcpy(_th, th, sizeof(*th));
1323         _th->check = 0;
1324
1325         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1326         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1327                                 sizeof(*bp) + sizeof(*th));
1328         return crypto_ahash_update(hp->md5_req);
1329 }
1330
1331 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1332                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1333 {
1334         struct tcp_md5sig_pool *hp;
1335         struct ahash_request *req;
1336
1337         hp = tcp_get_md5sig_pool();
1338         if (!hp)
1339                 goto clear_hash_noput;
1340         req = hp->md5_req;
1341
1342         if (crypto_ahash_init(req))
1343                 goto clear_hash;
1344         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1345                 goto clear_hash;
1346         if (tcp_md5_hash_key(hp, key))
1347                 goto clear_hash;
1348         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1349         if (crypto_ahash_final(req))
1350                 goto clear_hash;
1351
1352         tcp_put_md5sig_pool();
1353         return 0;
1354
1355 clear_hash:
1356         tcp_put_md5sig_pool();
1357 clear_hash_noput:
1358         memset(md5_hash, 0, 16);
1359         return 1;
1360 }
1361
1362 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1363                         const struct sock *sk,
1364                         const struct sk_buff *skb)
1365 {
1366         struct tcp_md5sig_pool *hp;
1367         struct ahash_request *req;
1368         const struct tcphdr *th = tcp_hdr(skb);
1369         __be32 saddr, daddr;
1370
1371         if (sk) { /* valid for establish/request sockets */
1372                 saddr = sk->sk_rcv_saddr;
1373                 daddr = sk->sk_daddr;
1374         } else {
1375                 const struct iphdr *iph = ip_hdr(skb);
1376                 saddr = iph->saddr;
1377                 daddr = iph->daddr;
1378         }
1379
1380         hp = tcp_get_md5sig_pool();
1381         if (!hp)
1382                 goto clear_hash_noput;
1383         req = hp->md5_req;
1384
1385         if (crypto_ahash_init(req))
1386                 goto clear_hash;
1387
1388         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1389                 goto clear_hash;
1390         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1391                 goto clear_hash;
1392         if (tcp_md5_hash_key(hp, key))
1393                 goto clear_hash;
1394         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1395         if (crypto_ahash_final(req))
1396                 goto clear_hash;
1397
1398         tcp_put_md5sig_pool();
1399         return 0;
1400
1401 clear_hash:
1402         tcp_put_md5sig_pool();
1403 clear_hash_noput:
1404         memset(md5_hash, 0, 16);
1405         return 1;
1406 }
1407 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1408
1409 #endif
1410
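/* Seed a freshly allocated request sock from the incoming SYN: the SYN's
 * destination becomes our local address, its source the remote address,
 * and any IP options are saved for use by the SYN-ACK and the child socket.
 */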
1411 static void tcp_v4_init_req(struct request_sock *req,
1412                             const struct sock *sk_listener,
1413                             struct sk_buff *skb)
1414 {
1415         struct inet_request_sock *ireq = inet_rsk(req);
1416         struct net *net = sock_net(sk_listener);
1417
1418         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1419         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1420         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1421 }
1422
1423 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1424                                           struct sk_buff *skb,
1425                                           struct flowi *fl,
1426                                           struct request_sock *req)
1427 {
1428         tcp_v4_init_req(req, sk, skb);
1429
1430         if (security_inet_conn_request(sk, skb, req))
1431                 return NULL;
1432
1433         return inet_csk_route_req(sk, &fl->u.ip4, req);
1434 }
1435
1436 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1437         .family         =       PF_INET,
1438         .obj_size       =       sizeof(struct tcp_request_sock),
1439         .rtx_syn_ack    =       tcp_rtx_synack,
1440         .send_ack       =       tcp_v4_reqsk_send_ack,
1441         .destructor     =       tcp_v4_reqsk_destructor,
1442         .send_reset     =       tcp_v4_send_reset,
1443         .syn_ack_timeout =      tcp_syn_ack_timeout,
1444 };
1445
1446 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1447         .mss_clamp      =       TCP_MSS_DEFAULT,
1448 #ifdef CONFIG_TCP_MD5SIG
1449         .req_md5_lookup =       tcp_v4_md5_lookup,
1450         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1451 #endif
1452 #ifdef CONFIG_SYN_COOKIES
1453         .cookie_init_seq =      cookie_v4_init_sequence,
1454 #endif
1455         .route_req      =       tcp_v4_route_req,
1456         .init_seq       =       tcp_v4_init_seq,
1457         .init_ts_off    =       tcp_v4_init_ts_off,
1458         .send_synack    =       tcp_v4_send_synack,
1459 };
1460
1461 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1462 {
1463         /* Never answer SYNs sent to broadcast or multicast addresses */
1464         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1465                 goto drop;
1466
1467         return tcp_conn_request(&tcp_request_sock_ops,
1468                                 &tcp_request_sock_ipv4_ops, sk, skb);
1469
1470 drop:
1471         tcp_listendrop(sk);
1472         return 0;
1473 }
1474 EXPORT_SYMBOL(tcp_v4_conn_request);
1475
1476
1477 /*
1478  * The three-way handshake has completed - we got a valid ACK -
1479  * now create the new socket.
1480  */
1481 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1482                                   struct request_sock *req,
1483                                   struct dst_entry *dst,
1484                                   struct request_sock *req_unhash,
1485                                   bool *own_req)
1486 {
1487         struct inet_request_sock *ireq;
1488         bool found_dup_sk = false;
1489         struct inet_sock *newinet;
1490         struct tcp_sock *newtp;
1491         struct sock *newsk;
1492 #ifdef CONFIG_TCP_MD5SIG
1493         const union tcp_md5_addr *addr;
1494         struct tcp_md5sig_key *key;
1495         int l3index;
1496 #endif
1497         struct ip_options_rcu *inet_opt;
1498
1499         if (sk_acceptq_is_full(sk))
1500                 goto exit_overflow;
1501
1502         newsk = tcp_create_openreq_child(sk, req, skb);
1503         if (!newsk)
1504                 goto exit_nonewsk;
1505
1506         newsk->sk_gso_type = SKB_GSO_TCPV4;
1507         inet_sk_rx_dst_set(newsk, skb);
1508
1509         newtp                 = tcp_sk(newsk);
1510         newinet               = inet_sk(newsk);
1511         ireq                  = inet_rsk(req);
1512         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1513         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1514         newsk->sk_bound_dev_if = ireq->ir_iif;
1515         newinet->inet_saddr   = ireq->ir_loc_addr;
1516         inet_opt              = rcu_dereference(ireq->ireq_opt);
1517         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1518         newinet->mc_index     = inet_iif(skb);
1519         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1520         newinet->rcv_tos      = ip_hdr(skb)->tos;
1521         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1522         if (inet_opt)
1523                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1524         newinet->inet_id = prandom_u32();
1525
1526         /* Set the ToS of the new socket based upon the value of the incoming SYN.
1527          * ECT bits are set later in tcp_init_transfer().
1528          */
1529         if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1530                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1531
1532         if (!dst) {
1533                 dst = inet_csk_route_child_sock(sk, newsk, req);
1534                 if (!dst)
1535                         goto put_and_exit;
1536         } else {
1537                 /* syncookie case : see end of cookie_v4_check() */
1538         }
1539         sk_setup_caps(newsk, dst);
1540
1541         tcp_ca_openreq_child(newsk, dst);
1542
1543         tcp_sync_mss(newsk, dst_mtu(dst));
1544         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1545
1546         tcp_initialize_rcv_mss(newsk);
1547
1548 #ifdef CONFIG_TCP_MD5SIG
1549         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1550         /* Copy over the MD5 key from the original socket */
1551         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1552         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1553         if (key) {
1554                 /*
1555                  * We're using one, so create a matching key
1556                  * on the newsk structure. If we fail to get
1557                  * memory, then we end up not copying the key
1558                  * across. Shucks.
1559                  */
1560                 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1561                                key->key, key->keylen, GFP_ATOMIC);
1562                 sk_gso_disable(newsk);
1563         }
1564 #endif
1565
1566         if (__inet_inherit_port(sk, newsk) < 0)
1567                 goto put_and_exit;
1568         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1569                                        &found_dup_sk);
1570         if (likely(*own_req)) {
1571                 tcp_move_syn(newtp, req);
1572                 ireq->ireq_opt = NULL;
1573         } else {
1574                 newinet->inet_opt = NULL;
1575
1576                 if (!req_unhash && found_dup_sk) {
1577                         /* This code path should be executed only in the
1578                          * syncookie case
1579                          */
1580                         bh_unlock_sock(newsk);
1581                         sock_put(newsk);
1582                         newsk = NULL;
1583                 }
1584         }
1585         return newsk;
1586
1587 exit_overflow:
1588         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1589 exit_nonewsk:
1590         dst_release(dst);
1591 exit:
1592         tcp_listendrop(sk);
1593         return NULL;
1594 put_and_exit:
1595         newinet->inet_opt = NULL;
1596         inet_csk_prepare_forced_close(newsk);
1597         tcp_done(newsk);
1598         goto exit;
1599 }
1600 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1601
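/* If the SYN queue overflowed and syncookies were sent, the ACK completing the
 * handshake arrives without a matching request sock.  For such non-SYN segments
 * hitting a listener, cookie_v4_check() validates the cookie and, on success,
 * resurrects the connection as a child socket.
 */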
1602 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1603 {
1604 #ifdef CONFIG_SYN_COOKIES
1605         const struct tcphdr *th = tcp_hdr(skb);
1606
1607         if (!th->syn)
1608                 sk = cookie_v4_check(sk, skb);
1609 #endif
1610         return sk;
1611 }
1612
1613 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1614                          struct tcphdr *th, u32 *cookie)
1615 {
1616         u16 mss = 0;
1617 #ifdef CONFIG_SYN_COOKIES
1618         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1619                                     &tcp_request_sock_ipv4_ops, sk, th);
1620         if (mss) {
1621                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1622                 tcp_synq_overflow(sk);
1623         }
1624 #endif
1625         return mss;
1626 }
1627
1628 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1629                                                            u32));
1630 /* The socket must have its spinlock held when we get
1631  * here, unless it is a TCP_LISTEN socket.
1632  *
1633  * We have a potential double-lock case here, so even when
1634  * doing backlog processing we use the BH locking scheme.
1635  * This is because we cannot sleep with the original spinlock
1636  * held.
1637  */
1638 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1639 {
1640         enum skb_drop_reason reason;
1641         struct sock *rsk;
1642
1643         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1644                 struct dst_entry *dst;
1645
1646                 dst = rcu_dereference_protected(sk->sk_rx_dst,
1647                                                 lockdep_sock_is_held(sk));
1648
1649                 sock_rps_save_rxhash(sk, skb);
1650                 sk_mark_napi_id(sk, skb);
1651                 if (dst) {
1652                         if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1653                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1654                                              dst, 0)) {
1655                                 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1656                                 dst_release(dst);
1657                         }
1658                 }
1659                 tcp_rcv_established(sk, skb);
1660                 return 0;
1661         }
1662
1663         reason = SKB_DROP_REASON_NOT_SPECIFIED;
1664         if (tcp_checksum_complete(skb))
1665                 goto csum_err;
1666
1667         if (sk->sk_state == TCP_LISTEN) {
1668                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1669
1670                 if (!nsk)
1671                         goto discard;
1672                 if (nsk != sk) {
1673                         if (tcp_child_process(sk, nsk, skb)) {
1674                                 rsk = nsk;
1675                                 goto reset;
1676                         }
1677                         return 0;
1678                 }
1679         } else
1680                 sock_rps_save_rxhash(sk, skb);
1681
1682         if (tcp_rcv_state_process(sk, skb)) {
1683                 rsk = sk;
1684                 goto reset;
1685         }
1686         return 0;
1687
1688 reset:
1689         tcp_v4_send_reset(rsk, skb);
1690 discard:
1691         kfree_skb_reason(skb, reason);
1692         /* Be careful here. If this function gets more complicated and
1693          * gcc suffers from register pressure on the x86, sk (in %ebx)
1694          * might be destroyed here. This current version compiles correctly,
1695          * but you have been warned.
1696          */
1697         return 0;
1698
1699 csum_err:
1700         reason = SKB_DROP_REASON_TCP_CSUM;
1701         trace_tcp_bad_csum(skb);
1702         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1703         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1704         goto discard;
1705 }
1706 EXPORT_SYMBOL(tcp_v4_do_rcv);
1707
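/* Early demux: called from the IPv4 input path before the routing decision.
 * A successful lookup in the established hash lets us attach the socket and
 * its cached input route to the skb, avoiding a second socket lookup and a
 * full route lookup in the common case.
 */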
1708 int tcp_v4_early_demux(struct sk_buff *skb)
1709 {
1710         const struct iphdr *iph;
1711         const struct tcphdr *th;
1712         struct sock *sk;
1713
1714         if (skb->pkt_type != PACKET_HOST)
1715                 return 0;
1716
1717         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1718                 return 0;
1719
1720         iph = ip_hdr(skb);
1721         th = tcp_hdr(skb);
1722
1723         if (th->doff < sizeof(struct tcphdr) / 4)
1724                 return 0;
1725
1726         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1727                                        iph->saddr, th->source,
1728                                        iph->daddr, ntohs(th->dest),
1729                                        skb->skb_iif, inet_sdif(skb));
1730         if (sk) {
1731                 skb->sk = sk;
1732                 skb->destructor = sock_edemux;
1733                 if (sk_fullsock(sk)) {
1734                         struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1735
1736                         if (dst)
1737                                 dst = dst_check(dst, 0);
1738                         if (dst &&
1739                             sk->sk_rx_dst_ifindex == skb->skb_iif)
1740                                 skb_dst_set_noref(skb, dst);
1741                 }
1742         }
1743         return 0;
1744 }
1745
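/* Called with the socket bh-locked while it is owned by the user: queue the
 * skb on the backlog.  We first try a GRO-like coalesce into the backlog tail,
 * otherwise charge the skb against a limit derived from sk_rcvbuf + sk_sndbuf
 * plus a little headroom.  Returns true if the skb was dropped (the socket is
 * already unlocked in that case), false if it was queued or merged.
 */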
1746 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1747                      enum skb_drop_reason *reason)
1748 {
1749         u32 limit, tail_gso_size, tail_gso_segs;
1750         struct skb_shared_info *shinfo;
1751         const struct tcphdr *th;
1752         struct tcphdr *thtail;
1753         struct sk_buff *tail;
1754         unsigned int hdrlen;
1755         bool fragstolen;
1756         u32 gso_segs;
1757         u32 gso_size;
1758         int delta;
1759
1760         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1761          * we can fix skb->truesize to its real value to avoid future drops.
1762          * This is valid because skb is not yet charged to the socket.
1763          * It has been noticed that pure SACK packets were sometimes dropped
1764          * (when cooked by drivers without the copybreak feature).
1765          */
1766         skb_condense(skb);
1767
1768         skb_dst_drop(skb);
1769
1770         if (unlikely(tcp_checksum_complete(skb))) {
1771                 bh_unlock_sock(sk);
1772                 trace_tcp_bad_csum(skb);
1773                 *reason = SKB_DROP_REASON_TCP_CSUM;
1774                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1775                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1776                 return true;
1777         }
1778
1779         /* Attempt coalescing to last skb in backlog, even if we are
1780          * above the limits.
1781          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1782          */
1783         th = (const struct tcphdr *)skb->data;
1784         hdrlen = th->doff * 4;
1785
1786         tail = sk->sk_backlog.tail;
1787         if (!tail)
1788                 goto no_coalesce;
1789         thtail = (struct tcphdr *)tail->data;
1790
1791         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1792             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1793             ((TCP_SKB_CB(tail)->tcp_flags |
1794               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1795             !((TCP_SKB_CB(tail)->tcp_flags &
1796               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1797             ((TCP_SKB_CB(tail)->tcp_flags ^
1798               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1799 #ifdef CONFIG_TLS_DEVICE
1800             tail->decrypted != skb->decrypted ||
1801 #endif
1802             thtail->doff != th->doff ||
1803             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1804                 goto no_coalesce;
1805
1806         __skb_pull(skb, hdrlen);
1807
1808         shinfo = skb_shinfo(skb);
1809         gso_size = shinfo->gso_size ?: skb->len;
1810         gso_segs = shinfo->gso_segs ?: 1;
1811
1812         shinfo = skb_shinfo(tail);
1813         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1814         tail_gso_segs = shinfo->gso_segs ?: 1;
1815
1816         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1817                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1818
1819                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1820                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1821                         thtail->window = th->window;
1822                 }
1823
1824                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1825                  * thtail->fin, so that the fast path in tcp_rcv_established()
1826                  * is not entered if we append a packet with a FIN.
1827                  * SYN, RST, URG are not present.
1828                  * ACK is set on both packets.
1829                  * PSH : we do not really care in TCP stack,
1830                  *       at least for 'GRO' packets.
1831                  */
1832                 thtail->fin |= th->fin;
1833                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1834
1835                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1836                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1837                         tail->tstamp = skb->tstamp;
1838                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1839                 }
1840
1841                 /* Not as strict as GRO. We only need to carry mss max value */
1842                 shinfo->gso_size = max(gso_size, tail_gso_size);
1843                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1844
1845                 sk->sk_backlog.len += delta;
1846                 __NET_INC_STATS(sock_net(sk),
1847                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1848                 kfree_skb_partial(skb, fragstolen);
1849                 return false;
1850         }
1851         __skb_push(skb, hdrlen);
1852
1853 no_coalesce:
1854         /* Only the socket owner can try to collapse/prune rx queues
1855          * to reduce memory overhead, so add a little headroom here.
1856          * Only a few socket backlogs are likely to be non-empty at the same time.
1857          */
1858         limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
1859
1860         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1861                 bh_unlock_sock(sk);
1862                 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1863                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1864                 return true;
1865         }
1866         return false;
1867 }
1868 EXPORT_SYMBOL(tcp_add_backlog);
1869
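/* Run the socket filter (if any) on the raw segment.  The trim cap of
 * th->doff * 4 lets a filter shorten the payload but never chop into the
 * TCP header itself.
 */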
1870 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1871 {
1872         struct tcphdr *th = (struct tcphdr *)skb->data;
1873
1874         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1875 }
1876 EXPORT_SYMBOL(tcp_filter);
1877
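/* Undo tcp_v4_fill_cb(): move the saved IP control block back to the front of
 * skb->cb[] before the skb is looked up again or handed to another socket that
 * expects a pristine IPCB.
 */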
1878 static void tcp_v4_restore_cb(struct sk_buff *skb)
1879 {
1880         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1881                 sizeof(struct inet_skb_parm));
1882 }
1883
1884 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1885                            const struct tcphdr *th)
1886 {
1887         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1888          * barrier() makes sure the compiler won't play aliasing games.
1889          */
1890         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1891                 sizeof(struct inet_skb_parm));
1892         barrier();
1893
1894         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1895         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1896                                     skb->len - th->doff * 4);
1897         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1898         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1899         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1900         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1901         TCP_SKB_CB(skb)->sacked  = 0;
1902         TCP_SKB_CB(skb)->has_rxtstamp =
1903                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1904 }
1905
1906 /*
1907  *      From tcp_input.c
1908  */
1909
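/* Main IPv4 TCP receive entry point (the protocol handler for IPPROTO_TCP):
 * validate the header and checksum, find the owning socket, apply MD5/XFRM/
 * socket-filter policy, then either process the segment directly or defer it
 * to the backlog when the socket is owned by the user.
 */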
1910 int tcp_v4_rcv(struct sk_buff *skb)
1911 {
1912         struct net *net = dev_net(skb->dev);
1913         enum skb_drop_reason drop_reason;
1914         int sdif = inet_sdif(skb);
1915         int dif = inet_iif(skb);
1916         const struct iphdr *iph;
1917         const struct tcphdr *th;
1918         bool refcounted;
1919         struct sock *sk;
1920         int ret;
1921
1922         drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1923         if (skb->pkt_type != PACKET_HOST)
1924                 goto discard_it;
1925
1926         /* Count it even if it's bad */
1927         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1928
1929         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1930                 goto discard_it;
1931
1932         th = (const struct tcphdr *)skb->data;
1933
1934         if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1935                 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1936                 goto bad_packet;
1937         }
1938         if (!pskb_may_pull(skb, th->doff * 4))
1939                 goto discard_it;
1940
1941         /* An explanation is required here, I think.
1942          * Packet length and doff are validated by header prediction,
1943          * provided the case of th->doff == 0 is eliminated above.
1944          * So, we defer the remaining checks. */
1945
1946         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1947                 goto csum_error;
1948
1949         th = (const struct tcphdr *)skb->data;
1950         iph = ip_hdr(skb);
1951 lookup:
1952         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1953                                th->dest, sdif, &refcounted);
1954         if (!sk)
1955                 goto no_tcp_socket;
1956
1957 process:
1958         if (sk->sk_state == TCP_TIME_WAIT)
1959                 goto do_time_wait;
1960
1961         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1962                 struct request_sock *req = inet_reqsk(sk);
1963                 bool req_stolen = false;
1964                 struct sock *nsk;
1965
1966                 sk = req->rsk_listener;
1967                 drop_reason = tcp_inbound_md5_hash(sk, skb,
1968                                                    &iph->saddr, &iph->daddr,
1969                                                    AF_INET, dif, sdif);
1970                 if (unlikely(drop_reason)) {
1971                         sk_drops_add(sk, skb);
1972                         reqsk_put(req);
1973                         goto discard_it;
1974                 }
1975                 if (tcp_checksum_complete(skb)) {
1976                         reqsk_put(req);
1977                         goto csum_error;
1978                 }
1979                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1980                         nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
1981                         if (!nsk) {
1982                                 inet_csk_reqsk_queue_drop_and_put(sk, req);
1983                                 goto lookup;
1984                         }
1985                         sk = nsk;
1986                         /* reuseport_migrate_sock() has already taken a reference
1987                          * (sk_refcnt) on the returned socket.
1988                          */
1989                 } else {
1990                         /* We own a reference on the listener, increase it again
1991                          * as we might lose it too soon.
1992                          */
1993                         sock_hold(sk);
1994                 }
1995                 refcounted = true;
1996                 nsk = NULL;
1997                 if (!tcp_filter(sk, skb)) {
1998                         th = (const struct tcphdr *)skb->data;
1999                         iph = ip_hdr(skb);
2000                         tcp_v4_fill_cb(skb, iph, th);
2001                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2002                 } else {
2003                         drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2004                 }
2005                 if (!nsk) {
2006                         reqsk_put(req);
2007                         if (req_stolen) {
2008                         /* Another cpu got exclusive access to req
2009                          * and created a full-blown socket.
2010                          * Try to feed this packet to that socket
2011                          * instead of discarding it.
2012                          */
2013                                 tcp_v4_restore_cb(skb);
2014                                 sock_put(sk);
2015                                 goto lookup;
2016                         }
2017                         goto discard_and_relse;
2018                 }
2019                 if (nsk == sk) {
2020                         reqsk_put(req);
2021                         tcp_v4_restore_cb(skb);
2022                 } else if (tcp_child_process(sk, nsk, skb)) {
2023                         tcp_v4_send_reset(nsk, skb);
2024                         goto discard_and_relse;
2025                 } else {
2026                         sock_put(sk);
2027                         return 0;
2028                 }
2029         }
2030
2031         if (static_branch_unlikely(&ip4_min_ttl)) {
2032                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2033                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2034                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2035                         goto discard_and_relse;
2036                 }
2037         }
2038
2039         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2040                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2041                 goto discard_and_relse;
2042         }
2043
2044         drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2045                                            &iph->daddr, AF_INET, dif, sdif);
2046         if (drop_reason)
2047                 goto discard_and_relse;
2048
2049         nf_reset_ct(skb);
2050
2051         if (tcp_filter(sk, skb)) {
2052                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2053                 goto discard_and_relse;
2054         }
2055         th = (const struct tcphdr *)skb->data;
2056         iph = ip_hdr(skb);
2057         tcp_v4_fill_cb(skb, iph, th);
2058
2059         skb->dev = NULL;
2060
2061         if (sk->sk_state == TCP_LISTEN) {
2062                 ret = tcp_v4_do_rcv(sk, skb);
2063                 goto put_and_return;
2064         }
2065
2066         sk_incoming_cpu_update(sk);
2067
2068         bh_lock_sock_nested(sk);
2069         tcp_segs_in(tcp_sk(sk), skb);
2070         ret = 0;
2071         if (!sock_owned_by_user(sk)) {
2072                 ret = tcp_v4_do_rcv(sk, skb);
2073         } else {
2074                 if (tcp_add_backlog(sk, skb, &drop_reason))
2075                         goto discard_and_relse;
2076         }
2077         bh_unlock_sock(sk);
2078
2079 put_and_return:
2080         if (refcounted)
2081                 sock_put(sk);
2082
2083         return ret;
2084
2085 no_tcp_socket:
2086         drop_reason = SKB_DROP_REASON_NO_SOCKET;
2087         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2088                 goto discard_it;
2089
2090         tcp_v4_fill_cb(skb, iph, th);
2091
2092         if (tcp_checksum_complete(skb)) {
2093 csum_error:
2094                 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2095                 trace_tcp_bad_csum(skb);
2096                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2097 bad_packet:
2098                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2099         } else {
2100                 tcp_v4_send_reset(NULL, skb);
2101         }
2102
2103 discard_it:
2104         /* Discard frame. */
2105         kfree_skb_reason(skb, drop_reason);
2106         return 0;
2107
2108 discard_and_relse:
2109         sk_drops_add(sk, skb);
2110         if (refcounted)
2111                 sock_put(sk);
2112         goto discard_it;
2113
2114 do_time_wait:
2115         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2116                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2117                 inet_twsk_put(inet_twsk(sk));
2118                 goto discard_it;
2119         }
2120
2121         tcp_v4_fill_cb(skb, iph, th);
2122
2123         if (tcp_checksum_complete(skb)) {
2124                 inet_twsk_put(inet_twsk(sk));
2125                 goto csum_error;
2126         }
2127         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2128         case TCP_TW_SYN: {
2129                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2130                                                         &tcp_hashinfo, skb,
2131                                                         __tcp_hdrlen(th),
2132                                                         iph->saddr, th->source,
2133                                                         iph->daddr, th->dest,
2134                                                         inet_iif(skb),
2135                                                         sdif);
2136                 if (sk2) {
2137                         inet_twsk_deschedule_put(inet_twsk(sk));
2138                         sk = sk2;
2139                         tcp_v4_restore_cb(skb);
2140                         refcounted = false;
2141                         goto process;
2142                 }
2143         }
2144                 /* to ACK */
2145                 fallthrough;
2146         case TCP_TW_ACK:
2147                 tcp_v4_timewait_ack(sk, skb);
2148                 break;
2149         case TCP_TW_RST:
2150                 tcp_v4_send_reset(sk, skb);
2151                 inet_twsk_deschedule_put(inet_twsk(sk));
2152                 goto discard_it;
2153         case TCP_TW_SUCCESS:;
2154         }
2155         goto discard_it;
2156 }
2157
2158 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2159         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2160         .twsk_unique    = tcp_twsk_unique,
2161         .twsk_destructor= tcp_twsk_destructor,
2162 };
2163
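/* Cache the input route of a validated skb on the socket so the established
 * fast path (and early demux) can reuse it, but only if a dst reference can
 * be taken safely.
 */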
2164 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2165 {
2166         struct dst_entry *dst = skb_dst(skb);
2167
2168         if (dst && dst_hold_safe(dst)) {
2169                 rcu_assign_pointer(sk->sk_rx_dst, dst);
2170                 sk->sk_rx_dst_ifindex = skb->skb_iif;
2171         }
2172 }
2173 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2174
2175 const struct inet_connection_sock_af_ops ipv4_specific = {
2176         .queue_xmit        = ip_queue_xmit,
2177         .send_check        = tcp_v4_send_check,
2178         .rebuild_header    = inet_sk_rebuild_header,
2179         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2180         .conn_request      = tcp_v4_conn_request,
2181         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2182         .net_header_len    = sizeof(struct iphdr),
2183         .setsockopt        = ip_setsockopt,
2184         .getsockopt        = ip_getsockopt,
2185         .addr2sockaddr     = inet_csk_addr2sockaddr,
2186         .sockaddr_len      = sizeof(struct sockaddr_in),
2187         .mtu_reduced       = tcp_v4_mtu_reduced,
2188 };
2189 EXPORT_SYMBOL(ipv4_specific);
2190
2191 #ifdef CONFIG_TCP_MD5SIG
2192 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2193         .md5_lookup             = tcp_v4_md5_lookup,
2194         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2195         .md5_parse              = tcp_v4_parse_md5_keys,
2196 };
2197 #endif
2198
2199 /* NOTE: A lot of things are set to zero explicitly by the call to
2200  *       sk_alloc(), so they need not be done here.
2201  */
2202 static int tcp_v4_init_sock(struct sock *sk)
2203 {
2204         struct inet_connection_sock *icsk = inet_csk(sk);
2205
2206         tcp_init_sock(sk);
2207
2208         icsk->icsk_af_ops = &ipv4_specific;
2209
2210 #ifdef CONFIG_TCP_MD5SIG
2211         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2212 #endif
2213
2214         return 0;
2215 }
2216
2217 void tcp_v4_destroy_sock(struct sock *sk)
2218 {
2219         struct tcp_sock *tp = tcp_sk(sk);
2220
2221         trace_tcp_destroy_sock(sk);
2222
2223         tcp_clear_xmit_timers(sk);
2224
2225         tcp_cleanup_congestion_control(sk);
2226
2227         tcp_cleanup_ulp(sk);
2228
2229         /* Clean up the write buffer. */
2230         tcp_write_queue_purge(sk);
2231
2232         /* Check if we want to disable active TFO */
2233         tcp_fastopen_active_disable_ofo_check(sk);
2234
2235         /* Cleans up our, hopefully empty, out_of_order_queue. */
2236         skb_rbtree_purge(&tp->out_of_order_queue);
2237
2238 #ifdef CONFIG_TCP_MD5SIG
2239         /* Clean up the MD5 key list, if any */
2240         if (tp->md5sig_info) {
2241                 tcp_clear_md5_list(sk);
2242                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2243                 tp->md5sig_info = NULL;
2244         }
2245 #endif
2246
2247         /* Clean up a referenced TCP bind bucket. */
2248         if (inet_csk(sk)->icsk_bind_hash)
2249                 inet_put_port(sk);
2250
2251         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2252
2253         /* If socket is aborted during connect operation */
2254         tcp_free_fastopen_req(tp);
2255         tcp_fastopen_destroy_cipher(sk);
2256         tcp_saved_syn_free(tp);
2257
2258         sk_sockets_allocated_dec(sk);
2259 }
2260 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2261
2262 #ifdef CONFIG_PROC_FS
2263 /* Proc filesystem TCP sock list dumping. */
2264
2265 static unsigned short seq_file_family(const struct seq_file *seq);
2266
2267 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2268 {
2269         unsigned short family = seq_file_family(seq);
2270
2271         /* AF_UNSPEC is used as a match-all */
2272         return ((family == AF_UNSPEC || family == sk->sk_family) &&
2273                 net_eq(sock_net(sk), seq_file_net(seq)));
2274 }
2275
2276 /* Find a non-empty bucket (starting from st->bucket)
2277  * and return the first sk from it.
2278  */
2279 static void *listening_get_first(struct seq_file *seq)
2280 {
2281         struct tcp_iter_state *st = seq->private;
2282
2283         st->offset = 0;
2284         for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2285                 struct inet_listen_hashbucket *ilb2;
2286                 struct hlist_nulls_node *node;
2287                 struct sock *sk;
2288
2289                 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2290                 if (hlist_nulls_empty(&ilb2->nulls_head))
2291                         continue;
2292
2293                 spin_lock(&ilb2->lock);
2294                 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2295                         if (seq_sk_match(seq, sk))
2296                                 return sk;
2297                 }
2298                 spin_unlock(&ilb2->lock);
2299         }
2300
2301         return NULL;
2302 }
2303
2304 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2305  * If "cur" is the last one in st->bucket,
2306  * call listening_get_first() to return the first sk of the next
2307  * non-empty bucket.
2308  */
2309 static void *listening_get_next(struct seq_file *seq, void *cur)
2310 {
2311         struct tcp_iter_state *st = seq->private;
2312         struct inet_listen_hashbucket *ilb2;
2313         struct hlist_nulls_node *node;
2314         struct sock *sk = cur;
2315
2316         ++st->num;
2317         ++st->offset;
2318
2319         sk = sk_nulls_next(sk);
2320         sk_nulls_for_each_from(sk, node) {
2321                 if (seq_sk_match(seq, sk))
2322                         return sk;
2323         }
2324
2325         ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2326         spin_unlock(&ilb2->lock);
2327         ++st->bucket;
2328         return listening_get_first(seq);
2329 }
2330
2331 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2332 {
2333         struct tcp_iter_state *st = seq->private;
2334         void *rc;
2335
2336         st->bucket = 0;
2337         st->offset = 0;
2338         rc = listening_get_first(seq);
2339
2340         while (rc && *pos) {
2341                 rc = listening_get_next(seq, rc);
2342                 --*pos;
2343         }
2344         return rc;
2345 }
2346
2347 static inline bool empty_bucket(const struct tcp_iter_state *st)
2348 {
2349         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2350 }
2351
2352 /*
2353  * Get first established socket starting from bucket given in st->bucket.
2354  * If st->bucket is zero, the very first socket in the hash is returned.
2355  */
2356 static void *established_get_first(struct seq_file *seq)
2357 {
2358         struct tcp_iter_state *st = seq->private;
2359
2360         st->offset = 0;
2361         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2362                 struct sock *sk;
2363                 struct hlist_nulls_node *node;
2364                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2365
2366                 /* Lockless fast path for the common case of empty buckets */
2367                 if (empty_bucket(st))
2368                         continue;
2369
2370                 spin_lock_bh(lock);
2371                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2372                         if (seq_sk_match(seq, sk))
2373                                 return sk;
2374                 }
2375                 spin_unlock_bh(lock);
2376         }
2377
2378         return NULL;
2379 }
2380
2381 static void *established_get_next(struct seq_file *seq, void *cur)
2382 {
2383         struct sock *sk = cur;
2384         struct hlist_nulls_node *node;
2385         struct tcp_iter_state *st = seq->private;
2386
2387         ++st->num;
2388         ++st->offset;
2389
2390         sk = sk_nulls_next(sk);
2391
2392         sk_nulls_for_each_from(sk, node) {
2393                 if (seq_sk_match(seq, sk))
2394                         return sk;
2395         }
2396
2397         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2398         ++st->bucket;
2399         return established_get_first(seq);
2400 }
2401
2402 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2403 {
2404         struct tcp_iter_state *st = seq->private;
2405         void *rc;
2406
2407         st->bucket = 0;
2408         rc = established_get_first(seq);
2409
2410         while (rc && pos) {
2411                 rc = established_get_next(seq, rc);
2412                 --pos;
2413         }
2414         return rc;
2415 }
2416
2417 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2418 {
2419         void *rc;
2420         struct tcp_iter_state *st = seq->private;
2421
2422         st->state = TCP_SEQ_STATE_LISTENING;
2423         rc        = listening_get_idx(seq, &pos);
2424
2425         if (!rc) {
2426                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2427                 rc        = established_get_idx(seq, pos);
2428         }
2429
2430         return rc;
2431 }
2432
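/* Resume iteration at the bucket/offset recorded by the previous pass so a
 * re-entered read of the seq_file does not have to rewalk every preceding
 * bucket from scratch.
 */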
2433 static void *tcp_seek_last_pos(struct seq_file *seq)
2434 {
2435         struct tcp_iter_state *st = seq->private;
2436         int bucket = st->bucket;
2437         int offset = st->offset;
2438         int orig_num = st->num;
2439         void *rc = NULL;
2440
2441         switch (st->state) {
2442         case TCP_SEQ_STATE_LISTENING:
2443                 if (st->bucket > tcp_hashinfo.lhash2_mask)
2444                         break;
2445                 st->state = TCP_SEQ_STATE_LISTENING;
2446                 rc = listening_get_first(seq);
2447                 while (offset-- && rc && bucket == st->bucket)
2448                         rc = listening_get_next(seq, rc);
2449                 if (rc)
2450                         break;
2451                 st->bucket = 0;
2452                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2453                 fallthrough;
2454         case TCP_SEQ_STATE_ESTABLISHED:
2455                 if (st->bucket > tcp_hashinfo.ehash_mask)
2456                         break;
2457                 rc = established_get_first(seq);
2458                 while (offset-- && rc && bucket == st->bucket)
2459                         rc = established_get_next(seq, rc);
2460         }
2461
2462         st->num = orig_num;
2463
2464         return rc;
2465 }
2466
2467 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2468 {
2469         struct tcp_iter_state *st = seq->private;
2470         void *rc;
2471
2472         if (*pos && *pos == st->last_pos) {
2473                 rc = tcp_seek_last_pos(seq);
2474                 if (rc)
2475                         goto out;
2476         }
2477
2478         st->state = TCP_SEQ_STATE_LISTENING;
2479         st->num = 0;
2480         st->bucket = 0;
2481         st->offset = 0;
2482         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2483
2484 out:
2485         st->last_pos = *pos;
2486         return rc;
2487 }
2488 EXPORT_SYMBOL(tcp_seq_start);
2489
2490 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2491 {
2492         struct tcp_iter_state *st = seq->private;
2493         void *rc = NULL;
2494
2495         if (v == SEQ_START_TOKEN) {
2496                 rc = tcp_get_idx(seq, 0);
2497                 goto out;
2498         }
2499
2500         switch (st->state) {
2501         case TCP_SEQ_STATE_LISTENING:
2502                 rc = listening_get_next(seq, v);
2503                 if (!rc) {
2504                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2505                         st->bucket = 0;
2506                         st->offset = 0;
2507                         rc        = established_get_first(seq);
2508                 }
2509                 break;
2510         case TCP_SEQ_STATE_ESTABLISHED:
2511                 rc = established_get_next(seq, v);
2512                 break;
2513         }
2514 out:
2515         ++*pos;
2516         st->last_pos = *pos;
2517         return rc;
2518 }
2519 EXPORT_SYMBOL(tcp_seq_next);
2520
2521 void tcp_seq_stop(struct seq_file *seq, void *v)
2522 {
2523         struct tcp_iter_state *st = seq->private;
2524
2525         switch (st->state) {
2526         case TCP_SEQ_STATE_LISTENING:
2527                 if (v != SEQ_START_TOKEN)
2528                         spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2529                 break;
2530         case TCP_SEQ_STATE_ESTABLISHED:
2531                 if (v)
2532                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2533                 break;
2534         }
2535 }
2536 EXPORT_SYMBOL(tcp_seq_stop);
2537
2538 static void get_openreq4(const struct request_sock *req,
2539                          struct seq_file *f, int i)
2540 {
2541         const struct inet_request_sock *ireq = inet_rsk(req);
2542         long delta = req->rsk_timer.expires - jiffies;
2543
2544         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2545                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2546                 i,
2547                 ireq->ir_loc_addr,
2548                 ireq->ir_num,
2549                 ireq->ir_rmt_addr,
2550                 ntohs(ireq->ir_rmt_port),
2551                 TCP_SYN_RECV,
2552                 0, 0, /* could print option size, but that is af dependent. */
2553                 1,    /* timers active (only the expire timer) */
2554                 jiffies_delta_to_clock_t(delta),
2555                 req->num_timeout,
2556                 from_kuid_munged(seq_user_ns(f),
2557                                  sock_i_uid(req->rsk_listener)),
2558                 0,  /* non standard timer */
2559                 0, /* open_requests have no inode */
2560                 0,
2561                 req);
2562 }
2563
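/* Emit one /proc/net/tcp line for a full socket.  tx_queue is
 * write_seq - snd_una, rx_queue is rcv_nxt - copied_seq (or the accept
 * backlog for listeners), and the "tr" timer codes are 1 for
 * retransmit/loss-probe, 2 for keepalive (sk_timer), 4 for zero-window
 * probe and 0 for none.
 */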
2564 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2565 {
2566         int timer_active;
2567         unsigned long timer_expires;
2568         const struct tcp_sock *tp = tcp_sk(sk);
2569         const struct inet_connection_sock *icsk = inet_csk(sk);
2570         const struct inet_sock *inet = inet_sk(sk);
2571         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2572         __be32 dest = inet->inet_daddr;
2573         __be32 src = inet->inet_rcv_saddr;
2574         __u16 destp = ntohs(inet->inet_dport);
2575         __u16 srcp = ntohs(inet->inet_sport);
2576         int rx_queue;
2577         int state;
2578
2579         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2580             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2581             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2582                 timer_active    = 1;
2583                 timer_expires   = icsk->icsk_timeout;
2584         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2585                 timer_active    = 4;
2586                 timer_expires   = icsk->icsk_timeout;
2587         } else if (timer_pending(&sk->sk_timer)) {
2588                 timer_active    = 2;
2589                 timer_expires   = sk->sk_timer.expires;
2590         } else {
2591                 timer_active    = 0;
2592                 timer_expires = jiffies;
2593         }
2594
2595         state = inet_sk_state_load(sk);
2596         if (state == TCP_LISTEN)
2597                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2598         else
2599                 /* Because we don't lock the socket,
2600                  * we might find a transient negative value.
2601                  */
2602                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2603                                       READ_ONCE(tp->copied_seq), 0);
2604
2605         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2606                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2607                 i, src, srcp, dest, destp, state,
2608                 READ_ONCE(tp->write_seq) - tp->snd_una,
2609                 rx_queue,
2610                 timer_active,
2611                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2612                 icsk->icsk_retransmits,
2613                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2614                 icsk->icsk_probes_out,
2615                 sock_i_ino(sk),
2616                 refcount_read(&sk->sk_refcnt), sk,
2617                 jiffies_to_clock_t(icsk->icsk_rto),
2618                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2619                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2620                 tcp_snd_cwnd(tp),
2621                 state == TCP_LISTEN ?
2622                     fastopenq->max_qlen :
2623                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2624 }
2625
2626 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2627                                struct seq_file *f, int i)
2628 {
2629         long delta = tw->tw_timer.expires - jiffies;
2630         __be32 dest, src;
2631         __u16 destp, srcp;
2632
2633         dest  = tw->tw_daddr;
2634         src   = tw->tw_rcv_saddr;
2635         destp = ntohs(tw->tw_dport);
2636         srcp  = ntohs(tw->tw_sport);
2637
2638         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2639                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2640                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2641                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2642                 refcount_read(&tw->tw_refcnt), tw);
2643 }
2644
2645 #define TMPSZ 150
2646
2647 static int tcp4_seq_show(struct seq_file *seq, void *v)
2648 {
2649         struct tcp_iter_state *st;
2650         struct sock *sk = v;
2651
2652         seq_setwidth(seq, TMPSZ - 1);
2653         if (v == SEQ_START_TOKEN) {
2654                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2655                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2656                            "inode");
2657                 goto out;
2658         }
2659         st = seq->private;
2660
2661         if (sk->sk_state == TCP_TIME_WAIT)
2662                 get_timewait4_sock(v, seq, st->num);
2663         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2664                 get_openreq4(v, seq, st->num);
2665         else
2666                 get_tcp4_sock(v, seq, st->num);
2667 out:
2668         seq_pad(seq, '\n');
2669         return 0;
2670 }
2671
2672 #ifdef CONFIG_BPF_SYSCALL
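/* State for the BPF TCP iterator: the sockets of one hash bucket are grabbed
 * into "batch" (each with a reference) under the bucket lock, then shown to
 * the BPF program one by one with the lock dropped so the program may lock
 * the socket or sleep.
 */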
2673 struct bpf_tcp_iter_state {
2674         struct tcp_iter_state state;
2675         unsigned int cur_sk;
2676         unsigned int end_sk;
2677         unsigned int max_sk;
2678         struct sock **batch;
2679         bool st_bucket_done;
2680 };
2681
2682 struct bpf_iter__tcp {
2683         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2684         __bpf_md_ptr(struct sock_common *, sk_common);
2685         uid_t uid __aligned(8);
2686 };
2687
2688 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2689                              struct sock_common *sk_common, uid_t uid)
2690 {
2691         struct bpf_iter__tcp ctx;
2692
2693         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2694         ctx.meta = meta;
2695         ctx.sk_common = sk_common;
2696         ctx.uid = uid;
2697         return bpf_iter_run_prog(prog, &ctx);
2698 }
2699
2700 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2701 {
2702         while (iter->cur_sk < iter->end_sk)
2703                 sock_put(iter->batch[iter->cur_sk++]);
2704 }
2705
2706 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2707                                       unsigned int new_batch_sz)
2708 {
2709         struct sock **new_batch;
2710
2711         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2712                              GFP_USER | __GFP_NOWARN);
2713         if (!new_batch)
2714                 return -ENOMEM;
2715
2716         bpf_iter_tcp_put_batch(iter);
2717         kvfree(iter->batch);
2718         iter->batch = new_batch;
2719         iter->max_sk = new_batch_sz;
2720
2721         return 0;
2722 }
2723
2724 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2725                                                  struct sock *start_sk)
2726 {
2727         struct bpf_tcp_iter_state *iter = seq->private;
2728         struct tcp_iter_state *st = &iter->state;
2729         struct hlist_nulls_node *node;
2730         unsigned int expected = 1;
2731         struct sock *sk;
2732
2733         sock_hold(start_sk);
2734         iter->batch[iter->end_sk++] = start_sk;
2735
2736         sk = sk_nulls_next(start_sk);
2737         sk_nulls_for_each_from(sk, node) {
2738                 if (seq_sk_match(seq, sk)) {
2739                         if (iter->end_sk < iter->max_sk) {
2740                                 sock_hold(sk);
2741                                 iter->batch[iter->end_sk++] = sk;
2742                         }
2743                         expected++;
2744                 }
2745         }
2746         spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2747
2748         return expected;
2749 }
2750
2751 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2752                                                    struct sock *start_sk)
2753 {
2754         struct bpf_tcp_iter_state *iter = seq->private;
2755         struct tcp_iter_state *st = &iter->state;
2756         struct hlist_nulls_node *node;
2757         unsigned int expected = 1;
2758         struct sock *sk;
2759
2760         sock_hold(start_sk);
2761         iter->batch[iter->end_sk++] = start_sk;
2762
2763         sk = sk_nulls_next(start_sk);
2764         sk_nulls_for_each_from(sk, node) {
2765                 if (seq_sk_match(seq, sk)) {
2766                         if (iter->end_sk < iter->max_sk) {
2767                                 sock_hold(sk);
2768                                 iter->batch[iter->end_sk++] = sk;
2769                         }
2770                         expected++;
2771                 }
2772         }
2773         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2774
2775         return expected;
2776 }
2777
2778 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2779 {
2780         struct bpf_tcp_iter_state *iter = seq->private;
2781         struct tcp_iter_state *st = &iter->state;
2782         unsigned int expected;
2783         bool resized = false;
2784         struct sock *sk;
2785
2786         /* The st->bucket is done.  Directly advance to the next
2787          * bucket instead of having tcp_seek_last_pos() skip the
2788          * current bucket one entry at a time only to find out that
2789          * it has to advance to the next bucket.
2790          */
2791         if (iter->st_bucket_done) {
2792                 st->offset = 0;
2793                 st->bucket++;
2794                 if (st->state == TCP_SEQ_STATE_LISTENING &&
2795                     st->bucket > tcp_hashinfo.lhash2_mask) {
2796                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2797                         st->bucket = 0;
2798                 }
2799         }
2800
2801 again:
2802         /* Get a new batch */
2803         iter->cur_sk = 0;
2804         iter->end_sk = 0;
2805         iter->st_bucket_done = false;
2806
2807         sk = tcp_seek_last_pos(seq);
2808         if (!sk)
2809                 return NULL; /* Done */
2810
2811         if (st->state == TCP_SEQ_STATE_LISTENING)
2812                 expected = bpf_iter_tcp_listening_batch(seq, sk);
2813         else
2814                 expected = bpf_iter_tcp_established_batch(seq, sk);
2815
2816         if (iter->end_sk == expected) {
2817                 iter->st_bucket_done = true;
2818                 return sk;
2819         }
2820
2821         if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2822                 resized = true;
2823                 goto again;
2824         }
2825
2826         return sk;
2827 }
2828
2829 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2830 {
2831         /* bpf iter does not support lseek, so it always
2832          * continues from where it was stop()-ped.
2833          */
2834         if (*pos)
2835                 return bpf_iter_tcp_batch(seq);
2836
2837         return SEQ_START_TOKEN;
2838 }
2839
2840 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2841 {
2842         struct bpf_tcp_iter_state *iter = seq->private;
2843         struct tcp_iter_state *st = &iter->state;
2844         struct sock *sk;
2845
2846         /* Whenever seq_next() is called, the iter->cur_sk is
2847          * done with seq_show(), so advance to the next sk in
2848          * the batch.
2849          */
2850         if (iter->cur_sk < iter->end_sk) {
2851                 /* Keeping st->num consistent in tcp_iter_state.
2852                  * bpf_iter_tcp does not use st->num.
2853                  * meta.seq_num is used instead.
2854                  */
2855                 st->num++;
2856                 /* Move st->offset to the next sk in the bucket such that
2857                  * the future start() will resume at st->offset in
2858                  * st->bucket.  See tcp_seek_last_pos().
2859                  */
2860                 st->offset++;
2861                 sock_put(iter->batch[iter->cur_sk++]);
2862         }
2863
2864         if (iter->cur_sk < iter->end_sk)
2865                 sk = iter->batch[iter->cur_sk];
2866         else
2867                 sk = bpf_iter_tcp_batch(seq);
2868
2869         ++*pos;
2870         /* Keep st->last_pos consistent in tcp_iter_state.
2871          * bpf iter does not do lseek, so st->last_pos always equals *pos.
2872          */
2873         st->last_pos = *pos;
2874         return sk;
2875 }
2876
2877 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2878 {
2879         struct bpf_iter_meta meta;
2880         struct bpf_prog *prog;
2881         struct sock *sk = v;
2882         bool slow;
2883         uid_t uid;
2884         int ret;
2885
2886         if (v == SEQ_START_TOKEN)
2887                 return 0;
2888
2889         if (sk_fullsock(sk))
2890                 slow = lock_sock_fast(sk);
2891
2892         if (unlikely(sk_unhashed(sk))) {
2893                 ret = SEQ_SKIP;
2894                 goto unlock;
2895         }
2896
2897         if (sk->sk_state == TCP_TIME_WAIT) {
2898                 uid = 0;
2899         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2900                 const struct request_sock *req = v;
2901
2902                 uid = from_kuid_munged(seq_user_ns(seq),
2903                                        sock_i_uid(req->rsk_listener));
2904         } else {
2905                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2906         }
2907
2908         meta.seq = seq;
2909         prog = bpf_iter_get_info(&meta, false);
2910         ret = tcp_prog_seq_show(prog, &meta, v, uid);
2911
2912 unlock:
2913         if (sk_fullsock(sk))
2914                 unlock_sock_fast(sk, slow);
2915         return ret;
2916
2917 }
2918
2919 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2920 {
2921         struct bpf_tcp_iter_state *iter = seq->private;
2922         struct bpf_iter_meta meta;
2923         struct bpf_prog *prog;
2924
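             /* A NULL v means the iteration has run to completion; give the
              * bpf prog one last callback with a NULL socket so it can emit
              * any end-of-dump output.
              */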
2925         if (!v) {
2926                 meta.seq = seq;
2927                 prog = bpf_iter_get_info(&meta, true);
2928                 if (prog)
2929                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
2930         }
2931
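             /* Release any batched sockets that were never shown; clearing
              * st_bucket_done lets a later start() re-batch this bucket
              * from st->offset.
              */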
2932         if (iter->cur_sk < iter->end_sk) {
2933                 bpf_iter_tcp_put_batch(iter);
2934                 iter->st_bucket_done = false;
2935         }
2936 }
2937
2938 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2939         .show           = bpf_iter_tcp_seq_show,
2940         .start          = bpf_iter_tcp_seq_start,
2941         .next           = bpf_iter_tcp_seq_next,
2942         .stop           = bpf_iter_tcp_seq_stop,
2943 };
2944 #endif
2945 static unsigned short seq_file_family(const struct seq_file *seq)
2946 {
2947         const struct tcp_seq_afinfo *afinfo;
2948
2949 #ifdef CONFIG_BPF_SYSCALL
2950         /* Iterated from bpf_iter.  Let the bpf prog filter instead. */
2951         if (seq->op == &bpf_iter_tcp_seq_ops)
2952                 return AF_UNSPEC;
2953 #endif
2954
2955         /* Iterated from proc fs */
2956         afinfo = pde_data(file_inode(seq->file));
2957         return afinfo->family;
2958 }
2959
2960 static const struct seq_operations tcp4_seq_ops = {
2961         .show           = tcp4_seq_show,
2962         .start          = tcp_seq_start,
2963         .next           = tcp_seq_next,
2964         .stop           = tcp_seq_stop,
2965 };
2966
2967 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2968         .family         = AF_INET,
2969 };
2970
2971 static int __net_init tcp4_proc_init_net(struct net *net)
2972 {
2973         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2974                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2975                 return -ENOMEM;
2976         return 0;
2977 }
2978
2979 static void __net_exit tcp4_proc_exit_net(struct net *net)
2980 {
2981         remove_proc_entry("tcp", net->proc_net);
2982 }
2983
2984 static struct pernet_operations tcp4_net_ops = {
2985         .init = tcp4_proc_init_net,
2986         .exit = tcp4_proc_exit_net,
2987 };
2988
2989 int __init tcp4_proc_init(void)
2990 {
2991         return register_pernet_subsys(&tcp4_net_ops);
2992 }
2993
2994 void tcp4_proc_exit(void)
2995 {
2996         unregister_pernet_subsys(&tcp4_net_ops);
2997 }
2998 #endif /* CONFIG_PROC_FS */
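
For readers poking at this from userspace: the proc entry registered by tcp4_proc_init_net() above shows up as /proc/net/tcp in each network namespace. A minimal, illustrative reader (a sketch, not part of this file):

#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/net/tcp", "r");	/* entry created by tcp4_proc_init_net() */

	if (!f)
		return 1;
	/* One header line, then one line per IPv4 TCP socket. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}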
2999
3000 /* @wake is one when sk_stream_write_space() calls us.
3001  * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3002  * This mimics the strategy used in sock_def_write_space().
3003  */
3004 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3005 {
3006         const struct tcp_sock *tp = tcp_sk(sk);
3007         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3008                             READ_ONCE(tp->snd_nxt);
3009
3010         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3011 }
3012 EXPORT_SYMBOL(tcp_stream_memory_free);
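
Since tcp_stream_memory_free() gates writability on tcp_notsent_lowat(), an application can bound its unsent backlog by setting TCP_NOTSENT_LOWAT and polling for POLLOUT. A minimal userspace sketch; write_chunk() and the 128 KiB threshold are illustrative, not taken from this file:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <poll.h>
#include <sys/socket.h>
#include <unistd.h>

/* Sketch only: fd is assumed to be a connected TCP socket. */
static ssize_t write_chunk(int fd, const void *buf, size_t len)
{
	int lowat = 128 * 1024;			/* illustrative threshold */
	struct pollfd pfd = { .fd = fd, .events = POLLOUT };

	if (setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
		       &lowat, sizeof(lowat)) < 0)
		return -1;

	/* POLLOUT is reported only while the not-yet-sent backlog stays
	 * below the lowat value checked in tcp_stream_memory_free().
	 */
	if (poll(&pfd, 1, -1) <= 0)
		return -1;

	return write(fd, buf, len);
}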
3013
3014 struct proto tcp_prot = {
3015         .name                   = "TCP",
3016         .owner                  = THIS_MODULE,
3017         .close                  = tcp_close,
3018         .pre_connect            = tcp_v4_pre_connect,
3019         .connect                = tcp_v4_connect,
3020         .disconnect             = tcp_disconnect,
3021         .accept                 = inet_csk_accept,
3022         .ioctl                  = tcp_ioctl,
3023         .init                   = tcp_v4_init_sock,
3024         .destroy                = tcp_v4_destroy_sock,
3025         .shutdown               = tcp_shutdown,
3026         .setsockopt             = tcp_setsockopt,
3027         .getsockopt             = tcp_getsockopt,
3028         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3029         .keepalive              = tcp_set_keepalive,
3030         .recvmsg                = tcp_recvmsg,
3031         .sendmsg                = tcp_sendmsg,
3032         .sendpage               = tcp_sendpage,
3033         .backlog_rcv            = tcp_v4_do_rcv,
3034         .release_cb             = tcp_release_cb,
3035         .hash                   = inet_hash,
3036         .unhash                 = inet_unhash,
3037         .get_port               = inet_csk_get_port,
3038         .put_port               = inet_put_port,
3039 #ifdef CONFIG_BPF_SYSCALL
3040         .psock_update_sk_prot   = tcp_bpf_update_proto,
3041 #endif
3042         .enter_memory_pressure  = tcp_enter_memory_pressure,
3043         .leave_memory_pressure  = tcp_leave_memory_pressure,
3044         .stream_memory_free     = tcp_stream_memory_free,
3045         .sockets_allocated      = &tcp_sockets_allocated,
3046         .orphan_count           = &tcp_orphan_count,
3047         .memory_allocated       = &tcp_memory_allocated,
3048         .memory_pressure        = &tcp_memory_pressure,
3049         .sysctl_mem             = sysctl_tcp_mem,
3050         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3051         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3052         .max_header             = MAX_TCP_HEADER,
3053         .obj_size               = sizeof(struct tcp_sock),
3054         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3055         .twsk_prot              = &tcp_timewait_sock_ops,
3056         .rsk_prot               = &tcp_request_sock_ops,
3057         .h.hashinfo             = &tcp_hashinfo,
3058         .no_autobind            = true,
3059         .diag_destroy           = tcp_abort,
3060 };
3061 EXPORT_SYMBOL(tcp_prot);
3062
3063 static void __net_exit tcp_sk_exit(struct net *net)
3064 {
3065         struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
3066
3067         if (net->ipv4.tcp_congestion_control)
3068                 bpf_module_put(net->ipv4.tcp_congestion_control,
3069                                net->ipv4.tcp_congestion_control->owner);
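             /* tcp_death_row may still be referenced by lingering timewait
              * sockets; free it only when the reference dropped here was
              * the last one.
              */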
3070         if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
3071                 kfree(tcp_death_row);
3072 }
3073
3074 static int __net_init tcp_sk_init(struct net *net)
3075 {
3076         int cnt;
3077
3078         net->ipv4.sysctl_tcp_ecn = 2;
3079         net->ipv4.sysctl_tcp_ecn_fallback = 1;
3080
3081         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3082         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3083         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3084         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3085         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3086
3087         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3088         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3089         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3090
3091         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3092         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3093         net->ipv4.sysctl_tcp_syncookies = 1;
3094         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3095         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3096         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3097         net->ipv4.sysctl_tcp_orphan_retries = 0;
3098         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3099         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3100         net->ipv4.sysctl_tcp_tw_reuse = 2;
3101         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3102
3103         net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
3104         if (!net->ipv4.tcp_death_row)
3105                 return -ENOMEM;
3106         refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
3107         cnt = tcp_hashinfo.ehash_mask + 1;
3108         net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
3109         net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
3110
3111         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3112         net->ipv4.sysctl_tcp_sack = 1;
3113         net->ipv4.sysctl_tcp_window_scaling = 1;
3114         net->ipv4.sysctl_tcp_timestamps = 1;
3115         net->ipv4.sysctl_tcp_early_retrans = 3;
3116         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3117         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3118         net->ipv4.sysctl_tcp_retrans_collapse = 1;
3119         net->ipv4.sysctl_tcp_max_reordering = 300;
3120         net->ipv4.sysctl_tcp_dsack = 1;
3121         net->ipv4.sysctl_tcp_app_win = 31;
3122         net->ipv4.sysctl_tcp_adv_win_scale = 1;
3123         net->ipv4.sysctl_tcp_frto = 2;
3124         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3125         /* This limits the percentage of the congestion window which we
3126          * will allow a single TSO frame to consume.  Building TSO frames
3127          * which are too large can cause TCP streams to be bursty.
3128          */
3129         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3130         /* Default TSQ limit of 16 TSO segments */
3131         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3132         /* RFC 5961 challenge ACK rate limiting */
3133         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3134         net->ipv4.sysctl_tcp_min_tso_segs = 2;
3135         net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3136         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3137         net->ipv4.sysctl_tcp_autocorking = 1;
3138         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3139         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3140         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3141         if (net != &init_net) {
3142                 memcpy(net->ipv4.sysctl_tcp_rmem,
3143                        init_net.ipv4.sysctl_tcp_rmem,
3144                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
3145                 memcpy(net->ipv4.sysctl_tcp_wmem,
3146                        init_net.ipv4.sysctl_tcp_wmem,
3147                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
3148         }
3149         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3150         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3151         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3152         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3153         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3154         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3155
3156         /* Reno is always built in */
3157         if (!net_eq(net, &init_net) &&
3158             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3159                                init_net.ipv4.tcp_congestion_control->owner))
3160                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3161         else
3162                 net->ipv4.tcp_congestion_control = &tcp_reno;
3163
3164         return 0;
3165 }
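
Everything tcp_sk_init() sets up above is per network namespace, so a freshly created netns starts from these defaults regardless of how the init namespace has been tuned. A hedged userspace sketch (requires CAP_SYS_ADMIN; tcp_syncookies is just one of the values initialised above):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	char val[64];
	FILE *f;

	/* Enter a brand-new network namespace; tcp_sk_init() runs for it. */
	if (unshare(CLONE_NEWNET))
		return 1;

	/* tcp_syncookies defaults to 1 per tcp_sk_init(), so a new
	 * namespace reports that value even if the init namespace was
	 * changed.
	 */
	f = fopen("/proc/sys/net/ipv4/tcp_syncookies", "r");
	if (!f || !fgets(val, sizeof(val), f))
		return 1;
	printf("tcp_syncookies in new netns: %s", val);
	fclose(f);
	return 0;
}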
3166
3167 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3168 {
3169         struct net *net;
3170
3171         list_for_each_entry(net, net_exit_list, exit_list)
3172                 tcp_fastopen_ctx_destroy(net);
3173 }
3174
3175 static struct pernet_operations __net_initdata tcp_sk_ops = {
3176        .init       = tcp_sk_init,
3177        .exit       = tcp_sk_exit,
3178        .exit_batch = tcp_sk_exit_batch,
3179 };
3180
3181 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3182 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3183                      struct sock_common *sk_common, uid_t uid)
3184
3185 #define INIT_BATCH_SZ 16
3186
3187 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3188 {
3189         struct bpf_tcp_iter_state *iter = priv_data;
3190         int err;
3191
3192         err = bpf_iter_init_seq_net(priv_data, aux);
3193         if (err)
3194                 return err;
3195
3196         err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3197         if (err) {
3198                 bpf_iter_fini_seq_net(priv_data);
3199                 return err;
3200         }
3201
3202         return 0;
3203 }
3204
3205 static void bpf_iter_fini_tcp(void *priv_data)
3206 {
3207         struct bpf_tcp_iter_state *iter = priv_data;
3208
3209         bpf_iter_fini_seq_net(priv_data);
3210         kvfree(iter->batch);
3211 }
3212
3213 static const struct bpf_iter_seq_info tcp_seq_info = {
3214         .seq_ops                = &bpf_iter_tcp_seq_ops,
3215         .init_seq_private       = bpf_iter_init_tcp,
3216         .fini_seq_private       = bpf_iter_fini_tcp,
3217         .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3218 };
3219
3220 static const struct bpf_func_proto *
3221 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3222                             const struct bpf_prog *prog)
3223 {
3224         switch (func_id) {
3225         case BPF_FUNC_setsockopt:
3226                 return &bpf_sk_setsockopt_proto;
3227         case BPF_FUNC_getsockopt:
3228                 return &bpf_sk_getsockopt_proto;
3229         default:
3230                 return NULL;
3231         }
3232 }
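
bpf_iter_tcp_get_func_proto() is what lets an iter/tcp program call bpf_setsockopt()/bpf_getsockopt() on the sockets it walks, on top of printing them through the seq_file. A hedged sketch of such a program, loosely modelled on the kernel selftests; vmlinux.h, the helper headers, the program/section naming and the "cubic" choice are assumptions, not taken from this file:

// SPDX-License-Identifier: GPL-2.0
/* Sketch only: assumes a bpftool-generated vmlinux.h and libbpf's helper
 * headers; SOL_TCP/TCP_CONGESTION are repeated from the UAPI.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

#define SOL_TCP		6
#define TCP_CONGESTION	13

char _license[] SEC("license") = "GPL";

SEC("iter/tcp")
int dump_and_tune_tcp(struct bpf_iter__tcp *ctx)
{
	struct sock_common *skc = ctx->sk_common;
	struct seq_file *seq = ctx->meta->seq;
	char cc[] = "cubic";		/* illustrative congestion control */
	struct tcp_sock *tp;

	if (!skc)
		return 0;		/* end of iteration (stop() callback) */

	/* One line per visited socket, written into the iterator's seq_file. */
	BPF_SEQ_PRINTF(seq, "family=%u lport=%u\n",
		       skc->skc_family, skc->skc_num);

	/* Full sockets can also be tuned, which is what the func_proto
	 * above makes possible.
	 */
	tp = bpf_skc_to_tcp_sock(skc);
	if (tp)
		bpf_setsockopt(tp, SOL_TCP, TCP_CONGESTION, cc, sizeof(cc));
	return 0;
}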
3233
3234 static struct bpf_iter_reg tcp_reg_info = {
3235         .target                 = "tcp",
3236         .ctx_arg_info_size      = 1,
3237         .ctx_arg_info           = {
3238                 { offsetof(struct bpf_iter__tcp, sk_common),
3239                   PTR_TO_BTF_ID_OR_NULL },
3240         },
3241         .get_func_proto         = bpf_iter_tcp_get_func_proto,
3242         .seq_info               = &tcp_seq_info,
3243 };
3244
3245 static void __init bpf_iter_register(void)
3246 {
3247         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3248         if (bpf_iter_reg_target(&tcp_reg_info))
3249                 pr_warn("Warning: could not register bpf iterator tcp\n");
3250 }
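
Userspace consumes the "tcp" target registered here by attaching an iter/tcp program with libbpf and read()-ing the iterator fd; the output is produced by the seq_file/batching code earlier in this file. A sketch, assuming a compiled object "tcp_iter.bpf.o" containing the dump_and_tune_tcp program sketched above (both names are placeholders):

#include <stdio.h>
#include <unistd.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

int main(void)
{
	struct bpf_object *obj;
	struct bpf_program *prog;
	struct bpf_link *link;
	char buf[4096];
	ssize_t n;
	int iter_fd;

	obj = bpf_object__open_file("tcp_iter.bpf.o", NULL);	/* placeholder name */
	if (!obj || bpf_object__load(obj))
		return 1;

	prog = bpf_object__find_program_by_name(obj, "dump_and_tune_tcp");
	if (!prog)
		return 1;

	link = bpf_program__attach_iter(prog, NULL);
	if (!link)
		return 1;

	iter_fd = bpf_iter_create(bpf_link__fd(link));
	if (iter_fd < 0)
		return 1;

	/* read() returns whatever BPF_SEQ_PRINTF wrote via the seq_file ops above. */
	while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);

	close(iter_fd);
	return 0;
}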
3251
3252 #endif
3253
3254 void __init tcp_v4_init(void)
3255 {
3256         int cpu, res;
3257
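             /* One control socket per possible CPU; they are used to send
              * RSTs and ACKs that are not tied to a full socket (see
              * tcp_v4_send_reset() and tcp_v4_send_ack()).
              */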
3258         for_each_possible_cpu(cpu) {
3259                 struct sock *sk;
3260
3261                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3262                                            IPPROTO_TCP, &init_net);
3263                 if (res)
3264                         panic("Failed to create the TCP control socket.\n");
3265                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3266
3267                 /* Please enforce IP_DF and IPID==0 for RST and
3268                  * ACK sent in SYN-RECV and TIME-WAIT state.
3269                  */
3270                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3271
3272                 per_cpu(ipv4_tcp_sk, cpu) = sk;
3273         }
3274         if (register_pernet_subsys(&tcp_sk_ops))
3275                 panic("Failed to create the TCP control socket.\n");
3276
3277 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3278         bpf_iter_register();
3279 #endif
3280 }