1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * Implementation of the Transmission Control Protocol (TCP).
9 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
48 #define pr_fmt(fmt) "TCP: " fmt
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
61 #include <net/net_namespace.h>
63 #include <net/inet_hashtables.h>
65 #include <net/transp_v6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
84 #include <trace/events/tcp.h>
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
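/* tcp_v4_init_seq() derives the initial sequence number from a keyed hash
 * of the connection 4-tuple plus a clock component (RFC 6528 style), so
 * ISNs are not predictable by off-path attackers; tcp_v4_init_ts_off()
 * similarly derives a timestamp offset per address pair.
 */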
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
96 return secure_tcp_seq(ip_hdr(skb)->daddr,
99 tcp_hdr(skb)->source);
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
104 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
109 const struct inet_timewait_sock *tw = inet_twsk(sktw);
110 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 struct tcp_sock *tp = tcp_sk(sk);
112 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
115 /* Still does not detect *everything* that goes through
116 * lo, since we require a loopback src or dst address
117 * or direct binding to 'lo' interface.
119 bool loopback = false;
120 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
122 #if IS_ENABLED(CONFIG_IPV6)
123 if (tw->tw_family == AF_INET6) {
124 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
132 if (ipv4_is_loopback(tw->tw_daddr) ||
133 ipv4_is_loopback(tw->tw_rcv_saddr))
140 /* With PAWS, it is safe from the viewpoint
141 of data integrity. Even without PAWS it is safe provided sequence
142 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
144 Actually, the idea is close to VJ's one, only the timestamp cache is
145 held not per host, but per port pair, and the TW bucket is used as state
146 holder.
148 If the TW bucket has already been destroyed we fall back to VJ's scheme
149 and use the initial timestamp retrieved from the peer table.
151 if (tcptw->tw_ts_recent_stamp &&
152 (!twp || (reuse && time_after32(ktime_get_seconds(),
153 tcptw->tw_ts_recent_stamp)))) {
154 /* In case of repair and re-using TIME-WAIT sockets we still
155 * want to be sure that it is safe as above but honor the
156 * sequence numbers and time stamps set as part of the repair
157 * process.
159 * Without this check re-using a TIME-WAIT socket with TCP
160 * repair would accumulate a -1 on the repair assigned
161 * sequence number. The first time it is reused the sequence
162 * is -1, the second time -2, etc. This fixes that issue
163 * without appearing to create any others.
165 if (likely(!tp->repair)) {
166 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
170 WRITE_ONCE(tp->write_seq, seq);
171 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
172 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
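/* Whether the reuse above is attempted at all is gated by the
 * net.ipv4.tcp_tw_reuse sysctl (the 'reuse' variable). A sketch of how an
 * administrator might enable it (illustrative, not part of this file):
 *
 *	# 0: never reuse, 1: global reuse, 2: loopback-only (default)
 *	sysctl -w net.ipv4.tcp_tw_reuse=1
 */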
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185 /* This check is replicated from tcp_v4_connect() and intended to
186 * prevent the BPF program called below from accessing bytes that are out
187 * of the bounds specified by the user in addr_len.
189 if (addr_len < sizeof(struct sockaddr_in))
192 sock_owned_by_me(sk);
194 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
200 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201 struct inet_sock *inet = inet_sk(sk);
202 struct tcp_sock *tp = tcp_sk(sk);
203 __be16 orig_sport, orig_dport;
204 __be32 daddr, nexthop;
208 struct ip_options_rcu *inet_opt;
209 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
211 if (addr_len < sizeof(struct sockaddr_in))
214 if (usin->sin_family != AF_INET)
215 return -EAFNOSUPPORT;
217 nexthop = daddr = usin->sin_addr.s_addr;
218 inet_opt = rcu_dereference_protected(inet->inet_opt,
219 lockdep_sock_is_held(sk));
220 if (inet_opt && inet_opt->opt.srr) {
223 nexthop = inet_opt->opt.faddr;
226 orig_sport = inet->inet_sport;
227 orig_dport = usin->sin_port;
228 fl4 = &inet->cork.fl.u.ip4;
229 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
232 orig_sport, orig_dport, sk);
235 if (err == -ENETUNREACH)
236 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
240 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
245 if (!inet_opt || !inet_opt->opt.srr)
248 if (!inet->inet_saddr)
249 inet->inet_saddr = fl4->saddr;
250 sk_rcv_saddr_set(sk, inet->inet_saddr);
252 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253 /* Reset inherited state */
254 tp->rx_opt.ts_recent = 0;
255 tp->rx_opt.ts_recent_stamp = 0;
256 if (likely(!tp->repair))
257 WRITE_ONCE(tp->write_seq, 0);
260 inet->inet_dport = usin->sin_port;
261 sk_daddr_set(sk, daddr);
263 inet_csk(sk)->icsk_ext_hdr_len = 0;
265 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
267 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
269 /* Socket identity is still unknown (sport may be zero).
270 * However, we set the state to SYN-SENT and, without releasing the socket
271 * lock, select a source port, enter ourselves into the hash tables and
272 * complete initialization after this.
274 tcp_set_state(sk, TCP_SYN_SENT);
275 err = inet_hash_connect(tcp_death_row, sk);
281 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282 inet->inet_sport, inet->inet_dport, sk);
288 /* OK, now commit destination to socket. */
289 sk->sk_gso_type = SKB_GSO_TCPV4;
290 sk_setup_caps(sk, &rt->dst);
293 if (likely(!tp->repair)) {
295 WRITE_ONCE(tp->write_seq,
296 secure_tcp_seq(inet->inet_saddr,
300 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
305 inet->inet_id = prandom_u32();
307 if (tcp_fastopen_defer_connect(sk, &err))
312 err = tcp_connect(sk);
321 * This unhashes the socket and releases the local port,
322 * if necessary.
324 tcp_set_state(sk, TCP_CLOSE);
326 sk->sk_route_caps = 0;
327 inet->inet_dport = 0;
330 EXPORT_SYMBOL(tcp_v4_connect);
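/* A minimal userspace sketch that exercises tcp_v4_connect() (reached via
 * inet_stream_connect() from the connect(2) syscall); illustrative only,
 * error handling omitted, TEST-NET address used:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in sa = {
 *		.sin_family = AF_INET,
 *		.sin_port = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &sa.sin_addr);
 *	connect(fd, (struct sockaddr *)&sa, sizeof(sa));
 */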
333 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334 * It can be called through tcp_release_cb() if socket was owned by user
335 * at the time tcp_v4_err() was called to handle ICMP message.
337 void tcp_v4_mtu_reduced(struct sock *sk)
339 struct inet_sock *inet = inet_sk(sk);
340 struct dst_entry *dst;
343 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
345 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
346 dst = inet_csk_update_pmtu(sk, mtu);
350 /* Something is about to go wrong... Remember the soft error
351 * for the case that this connection will not be able to recover.
353 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354 sk->sk_err_soft = EMSGSIZE;
358 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359 ip_sk_accept_pmtu(sk) &&
360 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361 tcp_sync_mss(sk, mtu);
363 /* Resend the TCP packet because it's
364 * clear that the old packet has been
365 * dropped. This is the new "fast" path mtu
366 * discovery.
368 tcp_simple_retransmit(sk);
369 } /* else let the usual retransmit timer handle it */
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
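/* Handle an ICMP redirect: revalidate the cached route and, if it is
 * still live, let the dst's ->redirect() handler update the next hop.
 */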
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
375 struct dst_entry *dst = __sk_dst_check(sk, 0);
378 dst->ops->redirect(dst, sk, skb);
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
385 struct request_sock *req = inet_reqsk(sk);
386 struct net *net = sock_net(sk);
388 /* ICMPs are not backlogged, hence we cannot get
389 * an established socket here.
391 if (seq != tcp_rsk(req)->snt_isn) {
392 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
395 * Still in SYN_RECV, just remove it silently.
396 * There is no good way to pass the error to the newly
397 * created socket, and POSIX does not want network
398 * errors returned from accept().
400 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401 tcp_listendrop(req->rsk_listener);
405 EXPORT_SYMBOL(tcp_req_err);
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
410 struct inet_connection_sock *icsk = inet_csk(sk);
411 struct tcp_sock *tp = tcp_sk(sk);
416 if (sock_owned_by_user(sk))
419 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
423 skb = tcp_rtx_queue_head(sk);
424 if (WARN_ON_ONCE(!skb))
427 icsk->icsk_backoff--;
428 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
431 tcp_mstamp_refresh(tp);
432 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
436 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437 remaining, TCP_RTO_MAX);
439 /* RTO revert clocked out retransmission.
440 * Will retransmit now.
442 tcp_retransmit_timer(sk);
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
448 * This routine is called by the ICMP module when it gets some
449 * sort of error condition. If err < 0 then the socket should
450 * be closed and the error returned to the user. If err > 0
451 * it's just the icmp type << 8 | icmp code. After adjustment
452 * header points to the first 8 bytes of the tcp header. We need
453 * to find the appropriate port.
455 * The locking strategy used here is very "optimistic". When
456 * someone else accesses the socket the ICMP is just dropped
457 * and for some paths there is no check at all.
458 * A more general error queue to queue errors for later handling
459 * is probably better.
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
465 const struct iphdr *iph = (const struct iphdr *)skb->data;
466 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
468 struct inet_sock *inet;
469 const int type = icmp_hdr(skb)->type;
470 const int code = icmp_hdr(skb)->code;
472 struct request_sock *fastopen;
475 struct net *net = dev_net(skb->dev);
477 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478 th->dest, iph->saddr, ntohs(th->source),
481 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
484 if (sk->sk_state == TCP_TIME_WAIT) {
485 inet_twsk_put(inet_twsk(sk));
488 seq = ntohl(th->seq);
489 if (sk->sk_state == TCP_NEW_SYN_RECV) {
490 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491 type == ICMP_TIME_EXCEEDED ||
492 (type == ICMP_DEST_UNREACH &&
493 (code == ICMP_NET_UNREACH ||
494 code == ICMP_HOST_UNREACH)));
499 /* If too many ICMPs get dropped on busy
500 * servers this needs to be solved differently.
501 * We do take care of PMTU discovery (RFC1191) special case:
502 * we can receive locally generated ICMP messages while socket is held.
504 if (sock_owned_by_user(sk)) {
505 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
508 if (sk->sk_state == TCP_CLOSE)
511 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
512 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
517 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
518 fastopen = rcu_dereference(tp->fastopen_rsk);
519 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
520 if (sk->sk_state != TCP_LISTEN &&
521 !between(seq, snd_una, tp->snd_nxt)) {
522 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
528 if (!sock_owned_by_user(sk))
529 do_redirect(skb, sk);
531 case ICMP_SOURCE_QUENCH:
532 /* Just silently ignore these. */
534 case ICMP_PARAMETERPROB:
537 case ICMP_DEST_UNREACH:
538 if (code > NR_ICMP_UNREACH)
541 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
542 /* We are not interested in TCP_LISTEN and open_requests
543 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
544 * they should go through unfragmented).
546 if (sk->sk_state == TCP_LISTEN)
549 WRITE_ONCE(tp->mtu_info, info);
550 if (!sock_owned_by_user(sk)) {
551 tcp_v4_mtu_reduced(sk);
553 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
559 err = icmp_err_convert[code].errno;
560 /* check if this ICMP message allows revert of backoff.
561 * (see RFC 6069)
564 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565 tcp_ld_RTO_revert(sk, seq);
567 case ICMP_TIME_EXCEEDED:
574 switch (sk->sk_state) {
577 /* Only in fast or simultaneous open. If a fast open socket is
578 * already accepted it is treated as a connected one below.
580 if (fastopen && !fastopen->sk)
583 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
585 if (!sock_owned_by_user(sk)) {
592 sk->sk_err_soft = err;
597 /* If we've already connected we will keep trying
598 * until we time out, or the user gives up.
600 * rfc1122 4.2.3.9 allows us to consider as hard errors
601 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
602 * but it is obsoleted by pmtu discovery).
604 * Note that on the modern internet, where routing is unreliable
605 * and broken firewalls sit in every dark corner sending random
606 * errors ordered by their masters, even these two messages have finally
607 * lost their original sense (even Linux sends invalid PORT_UNREACHs).
609 * Now we are in compliance with RFCs.
614 if (!sock_owned_by_user(sk) && inet->recverr) {
617 } else { /* Only an error on timeout */
618 sk->sk_err_soft = err;
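/* The helpers below prepare transmit checksums. Only the pseudo-header
 * sum is computed in software; skb->csum_start/csum_offset tell the device
 * (or skb_checksum_help() as a fallback) where to fold in the final sum,
 * i.e. the CHECKSUM_PARTIAL offload convention.
 */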
627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
629 struct tcphdr *th = tcp_hdr(skb);
631 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632 skb->csum_start = skb_transport_header(skb) - skb->head;
633 skb->csum_offset = offsetof(struct tcphdr, check);
636 /* This routine computes an IPv4 TCP checksum. */
637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
639 const struct inet_sock *inet = inet_sk(sk);
641 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
643 EXPORT_SYMBOL(tcp_v4_send_check);
646 * This routine will send an RST to the other tcp.
648 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)?
650 * Answer: if a packet caused the RST, it is not for a socket
651 * existing in our system; if it is matched to a socket,
652 * it is just a duplicate segment or a bug in the other side's TCP.
653 * So we build the reply based only on the parameters
654 * that arrived with the segment.
655 * Exception: precedence violation. We do not implement it in any case.
658 #ifdef CONFIG_TCP_MD5SIG
659 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
661 #define OPTION_BYTES sizeof(__be32)
664 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
666 const struct tcphdr *th = tcp_hdr(skb);
669 __be32 opt[OPTION_BYTES / sizeof(__be32)];
671 struct ip_reply_arg arg;
672 #ifdef CONFIG_TCP_MD5SIG
673 struct tcp_md5sig_key *key = NULL;
674 const __u8 *hash_location = NULL;
675 unsigned char newhash[16];
677 struct sock *sk1 = NULL;
679 u64 transmit_time = 0;
683 /* Never send a reset in response to a reset. */
687 /* If sk is not NULL, it means we did a successful lookup and the incoming
688 * route had to be correct. prequeue might have dropped our dst.
690 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
693 /* Swap the send and the receive. */
694 memset(&rep, 0, sizeof(rep));
695 rep.th.dest = th->source;
696 rep.th.source = th->dest;
697 rep.th.doff = sizeof(struct tcphdr) / 4;
701 rep.th.seq = th->ack_seq;
704 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
705 skb->len - (th->doff << 2));
708 memset(&arg, 0, sizeof(arg));
709 arg.iov[0].iov_base = (unsigned char *)&rep;
710 arg.iov[0].iov_len = sizeof(rep.th);
712 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
713 #ifdef CONFIG_TCP_MD5SIG
715 hash_location = tcp_parse_md5sig_option(th);
716 if (sk && sk_fullsock(sk)) {
717 const union tcp_md5_addr *addr;
720 /* sdif set means the packet ingressed via a device
721 * in an L3 domain and inet_iif is set to it.
723 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
724 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
725 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
726 } else if (hash_location) {
727 const union tcp_md5_addr *addr;
728 int sdif = tcp_v4_sdif(skb);
729 int dif = inet_iif(skb);
733 * active side is lost. Try to find the listening socket through the
734 * source port, and then find the md5 key through the listening socket.
735 * We are not losing security here:
736 * the incoming packet is checked with the md5 hash using the found key;
737 * no RST is generated if the md5 hash doesn't match.
739 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
741 th->source, ip_hdr(skb)->daddr,
742 ntohs(th->source), dif, sdif);
743 /* don't send an RST if we can't find a key */
747 /* sdif set means the packet ingressed via a device
748 * in an L3 domain and dif is set to it.
750 l3index = sdif ? dif : 0;
751 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
752 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
757 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
758 if (genhash || memcmp(hash_location, newhash, 16) != 0)
764 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
766 (TCPOPT_MD5SIG << 8) |
768 /* Update length and the length the header thinks exists */
769 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
770 rep.th.doff = arg.iov[0].iov_len / 4;
772 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
773 key, ip_hdr(skb)->saddr,
774 ip_hdr(skb)->daddr, &rep.th);
777 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
778 if (rep.opt[0] == 0) {
779 __be32 mrst = mptcp_reset_option(skb);
783 arg.iov[0].iov_len += sizeof(mrst);
784 rep.th.doff = arg.iov[0].iov_len / 4;
788 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
789 ip_hdr(skb)->saddr, /* XXX */
790 arg.iov[0].iov_len, IPPROTO_TCP, 0);
791 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
792 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
794 /* When the socket is gone, all binding information is lost;
795 * routing might fail in this case. No choice here: if we choose to force
796 * the input interface, we will misroute in case of an asymmetric route.
799 arg.bound_dev_if = sk->sk_bound_dev_if;
801 trace_tcp_send_reset(sk, skb);
804 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
805 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
807 arg.tos = ip_hdr(skb)->tos;
808 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
810 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
812 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
813 inet_twsk(sk)->tw_mark : sk->sk_mark;
814 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
815 inet_twsk(sk)->tw_priority : sk->sk_priority;
816 transmit_time = tcp_transmit_time(sk);
818 ip_send_unicast_reply(ctl_sk,
819 skb, &TCP_SKB_CB(skb)->header.h4.opt,
820 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
821 &arg, arg.iov[0].iov_len,
825 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
826 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
829 #ifdef CONFIG_TCP_MD5SIG
835 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
836 outside of socket context, is certainly ugly. What can I do?
839 static void tcp_v4_send_ack(const struct sock *sk,
840 struct sk_buff *skb, u32 seq, u32 ack,
841 u32 win, u32 tsval, u32 tsecr, int oif,
842 struct tcp_md5sig_key *key,
843 int reply_flags, u8 tos)
845 const struct tcphdr *th = tcp_hdr(skb);
848 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
849 #ifdef CONFIG_TCP_MD5SIG
850 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
854 struct net *net = sock_net(sk);
855 struct ip_reply_arg arg;
859 memset(&rep.th, 0, sizeof(struct tcphdr));
860 memset(&arg, 0, sizeof(arg));
862 arg.iov[0].iov_base = (unsigned char *)&rep;
863 arg.iov[0].iov_len = sizeof(rep.th);
865 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
866 (TCPOPT_TIMESTAMP << 8) |
868 rep.opt[1] = htonl(tsval);
869 rep.opt[2] = htonl(tsecr);
870 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
873 /* Swap the send and the receive. */
874 rep.th.dest = th->source;
875 rep.th.source = th->dest;
876 rep.th.doff = arg.iov[0].iov_len / 4;
877 rep.th.seq = htonl(seq);
878 rep.th.ack_seq = htonl(ack);
880 rep.th.window = htons(win);
882 #ifdef CONFIG_TCP_MD5SIG
884 int offset = (tsecr) ? 3 : 0;
886 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
888 (TCPOPT_MD5SIG << 8) |
890 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
891 rep.th.doff = arg.iov[0].iov_len/4;
893 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
894 key, ip_hdr(skb)->saddr,
895 ip_hdr(skb)->daddr, &rep.th);
898 arg.flags = reply_flags;
899 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
900 ip_hdr(skb)->saddr, /* XXX */
901 arg.iov[0].iov_len, IPPROTO_TCP, 0);
902 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
904 arg.bound_dev_if = oif;
906 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
908 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
909 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
910 inet_twsk(sk)->tw_mark : sk->sk_mark;
911 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
912 inet_twsk(sk)->tw_priority : sk->sk_priority;
913 transmit_time = tcp_transmit_time(sk);
914 ip_send_unicast_reply(ctl_sk,
915 skb, &TCP_SKB_CB(skb)->header.h4.opt,
916 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
917 &arg, arg.iov[0].iov_len,
921 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
925 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
927 struct inet_timewait_sock *tw = inet_twsk(sk);
928 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
930 tcp_v4_send_ack(sk, skb,
931 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
932 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
933 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
936 tcp_twsk_md5_key(tcptw),
937 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
944 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
945 struct request_sock *req)
947 const union tcp_md5_addr *addr;
950 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
951 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
953 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
957 * The window field (SEG.WND) of every outgoing segment, with the
958 * exception of <SYN> segments, MUST be right-shifted by
959 * Rcv.Wind.Shift bits:
961 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
962 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
963 tcp_v4_send_ack(sk, skb, seq,
964 tcp_rsk(req)->rcv_nxt,
965 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
966 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
969 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
970 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
975 * Send a SYN-ACK after having received a SYN.
976 * This still operates on a request_sock only, not on a big
977 * socket.
979 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
981 struct request_sock *req,
982 struct tcp_fastopen_cookie *foc,
983 enum tcp_synack_type synack_type,
984 struct sk_buff *syn_skb)
986 const struct inet_request_sock *ireq = inet_rsk(req);
992 /* First, grab a route. */
993 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
996 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
999 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1001 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1002 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1003 (inet_sk(sk)->tos & INET_ECN_MASK) :
1006 if (!INET_ECN_is_capable(tos) &&
1007 tcp_bpf_ca_needs_ecn((struct sock *)req))
1008 tos |= INET_ECN_ECT_0;
1011 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1013 rcu_dereference(ireq->ireq_opt),
1016 err = net_xmit_eval(err);
1023 * IPv4 request_sock destructor.
1025 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1027 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1030 #ifdef CONFIG_TCP_MD5SIG
1032 * RFC2385 MD5 checksumming requires a mapping of
1033 * IP address->MD5 Key.
1034 * We need to maintain these in the sk structure.
1037 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1038 EXPORT_SYMBOL(tcp_md5_needed);
1040 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1045 /* l3index always overrides non-l3index */
1046 if (old->l3index && new->l3index == 0)
1048 if (old->l3index == 0 && new->l3index)
1051 return old->prefixlen < new->prefixlen;
1054 /* Find the Key structure for an address. */
1055 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1056 const union tcp_md5_addr *addr,
1059 const struct tcp_sock *tp = tcp_sk(sk);
1060 struct tcp_md5sig_key *key;
1061 const struct tcp_md5sig_info *md5sig;
1063 struct tcp_md5sig_key *best_match = NULL;
1066 /* caller either holds rcu_read_lock() or socket lock */
1067 md5sig = rcu_dereference_check(tp->md5sig_info,
1068 lockdep_sock_is_held(sk));
1072 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1073 lockdep_sock_is_held(sk)) {
1074 if (key->family != family)
1076 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1078 if (family == AF_INET) {
1079 mask = inet_make_mask(key->prefixlen);
1080 match = (key->addr.a4.s_addr & mask) ==
1081 (addr->a4.s_addr & mask);
1082 #if IS_ENABLED(CONFIG_IPV6)
1083 } else if (family == AF_INET6) {
1084 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1091 if (match && better_md5_match(best_match, key))
1096 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1098 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1099 const union tcp_md5_addr *addr,
1100 int family, u8 prefixlen,
1101 int l3index, u8 flags)
1103 const struct tcp_sock *tp = tcp_sk(sk);
1104 struct tcp_md5sig_key *key;
1105 unsigned int size = sizeof(struct in_addr);
1106 const struct tcp_md5sig_info *md5sig;
1108 /* caller either holds rcu_read_lock() or socket lock */
1109 md5sig = rcu_dereference_check(tp->md5sig_info,
1110 lockdep_sock_is_held(sk));
1113 #if IS_ENABLED(CONFIG_IPV6)
1114 if (family == AF_INET6)
1115 size = sizeof(struct in6_addr);
1117 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1118 lockdep_sock_is_held(sk)) {
1119 if (key->family != family)
1121 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1123 if (key->l3index != l3index)
1125 if (!memcmp(&key->addr, addr, size) &&
1126 key->prefixlen == prefixlen)
1132 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1133 const struct sock *addr_sk)
1135 const union tcp_md5_addr *addr;
1138 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1139 addr_sk->sk_bound_dev_if);
1140 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1141 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1143 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1145 /* This can be called on a newly created socket, from other files */
1146 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1147 int family, u8 prefixlen, int l3index, u8 flags,
1148 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1150 /* Add Key to the list */
1151 struct tcp_md5sig_key *key;
1152 struct tcp_sock *tp = tcp_sk(sk);
1153 struct tcp_md5sig_info *md5sig;
1155 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1157 /* Pre-existing entry - just update that one.
1158 * Note that the key might be used concurrently.
1159 * data_race() is telling kcsan that we do not care about
1160 * key mismatches, since changing the MD5 key on live flows
1161 * can lead to packet drops.
1163 data_race(memcpy(key->key, newkey, newkeylen));
1165 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1166 * Also note that a reader could catch the new key->keylen value
1167 * but the old key->key[]; this is the reason we use __GFP_ZERO
1168 * at sock_kmalloc() time below these lines.
1170 WRITE_ONCE(key->keylen, newkeylen);
1175 md5sig = rcu_dereference_protected(tp->md5sig_info,
1176 lockdep_sock_is_held(sk));
1178 md5sig = kmalloc(sizeof(*md5sig), gfp);
1182 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1183 INIT_HLIST_HEAD(&md5sig->head);
1184 rcu_assign_pointer(tp->md5sig_info, md5sig);
1187 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1190 if (!tcp_alloc_md5sig_pool()) {
1191 sock_kfree_s(sk, key, sizeof(*key));
1195 memcpy(key->key, newkey, newkeylen);
1196 key->keylen = newkeylen;
1197 key->family = family;
1198 key->prefixlen = prefixlen;
1199 key->l3index = l3index;
1201 memcpy(&key->addr, addr,
1202 (family == AF_INET6) ? sizeof(struct in6_addr) :
1203 sizeof(struct in_addr));
1204 hlist_add_head_rcu(&key->node, &md5sig->head);
1207 EXPORT_SYMBOL(tcp_md5_do_add);
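/* Userspace reaches tcp_md5_do_add()/tcp_md5_do_del() via the TCP_MD5SIG
 * and TCP_MD5SIG_EXT socket options (see tcp_v4_parse_md5_keys() below).
 * A minimal sketch, illustrative only ('peer' is a hypothetical
 * struct sockaddr_in for the remote endpoint):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	memcpy(&md5.tcpm_addr, &peer, sizeof(peer));
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key for that address instead.
 */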
1209 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1210 u8 prefixlen, int l3index, u8 flags)
1212 struct tcp_md5sig_key *key;
1214 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1217 hlist_del_rcu(&key->node);
1218 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1219 kfree_rcu(key, rcu);
1222 EXPORT_SYMBOL(tcp_md5_do_del);
1224 static void tcp_clear_md5_list(struct sock *sk)
1226 struct tcp_sock *tp = tcp_sk(sk);
1227 struct tcp_md5sig_key *key;
1228 struct hlist_node *n;
1229 struct tcp_md5sig_info *md5sig;
1231 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1233 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1234 hlist_del_rcu(&key->node);
1235 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1236 kfree_rcu(key, rcu);
1240 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1241 sockptr_t optval, int optlen)
1243 struct tcp_md5sig cmd;
1244 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1245 const union tcp_md5_addr *addr;
1250 if (optlen < sizeof(cmd))
1253 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1256 if (sin->sin_family != AF_INET)
1259 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1261 if (optname == TCP_MD5SIG_EXT &&
1262 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1263 prefixlen = cmd.tcpm_prefixlen;
1268 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1269 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1270 struct net_device *dev;
1273 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1274 if (dev && netif_is_l3_master(dev))
1275 l3index = dev->ifindex;
1279 /* ok to reference set/not set outside of rcu;
1280 * right now device MUST be an L3 master
1282 if (!dev || !l3index)
1286 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1288 if (!cmd.tcpm_keylen)
1289 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1291 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1294 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1295 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1298 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1299 __be32 daddr, __be32 saddr,
1300 const struct tcphdr *th, int nbytes)
1302 struct tcp4_pseudohdr *bp;
1303 struct scatterlist sg;
1310 bp->protocol = IPPROTO_TCP;
1311 bp->len = cpu_to_be16(nbytes);
1313 _th = (struct tcphdr *)(bp + 1);
1314 memcpy(_th, th, sizeof(*th));
1317 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1318 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1319 sizeof(*bp) + sizeof(*th));
1320 return crypto_ahash_update(hp->md5_req);
1323 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1324 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1326 struct tcp_md5sig_pool *hp;
1327 struct ahash_request *req;
1329 hp = tcp_get_md5sig_pool();
1331 goto clear_hash_noput;
1334 if (crypto_ahash_init(req))
1336 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1338 if (tcp_md5_hash_key(hp, key))
1340 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1341 if (crypto_ahash_final(req))
1344 tcp_put_md5sig_pool();
1348 tcp_put_md5sig_pool();
1350 memset(md5_hash, 0, 16);
1354 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1355 const struct sock *sk,
1356 const struct sk_buff *skb)
1358 struct tcp_md5sig_pool *hp;
1359 struct ahash_request *req;
1360 const struct tcphdr *th = tcp_hdr(skb);
1361 __be32 saddr, daddr;
1363 if (sk) { /* valid for establish/request sockets */
1364 saddr = sk->sk_rcv_saddr;
1365 daddr = sk->sk_daddr;
1367 const struct iphdr *iph = ip_hdr(skb);
1372 hp = tcp_get_md5sig_pool();
1374 goto clear_hash_noput;
1377 if (crypto_ahash_init(req))
1380 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1382 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1384 if (tcp_md5_hash_key(hp, key))
1386 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1387 if (crypto_ahash_final(req))
1390 tcp_put_md5sig_pool();
1394 tcp_put_md5sig_pool();
1396 memset(md5_hash, 0, 16);
1399 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
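/* Per RFC 2385, the digest covers the IPv4 pseudo-header, the TCP header
 * with its checksum field zeroed, the payload, and finally the key itself;
 * tcp_v4_md5_hash_headers() feeds the first two pieces into the hash and
 * tcp_v4_md5_hash_skb() adds the payload and the key.
 */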
1403 /* Called with rcu_read_lock() */
1404 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1405 const struct sk_buff *skb,
1408 #ifdef CONFIG_TCP_MD5SIG
1410 * This gets called for each TCP segment that arrives
1411 * so we want to be efficient.
1412 * We have 3 drop cases:
1413 * o No MD5 hash and one expected.
1414 * o MD5 hash and we're not expecting one.
1415 * o MD5 hash and it's wrong.
1417 const __u8 *hash_location = NULL;
1418 struct tcp_md5sig_key *hash_expected;
1419 const struct iphdr *iph = ip_hdr(skb);
1420 const struct tcphdr *th = tcp_hdr(skb);
1421 const union tcp_md5_addr *addr;
1422 unsigned char newhash[16];
1423 int genhash, l3index;
1425 /* sdif set means the packet ingressed via a device
1426 * in an L3 domain and dif is set to the l3mdev
1428 l3index = sdif ? dif : 0;
1430 addr = (union tcp_md5_addr *)&iph->saddr;
1431 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1432 hash_location = tcp_parse_md5sig_option(th);
1434 /* We've parsed the options - do we have a hash? */
1435 if (!hash_expected && !hash_location)
1438 if (hash_expected && !hash_location) {
1439 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1443 if (!hash_expected && hash_location) {
1444 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1448 /* Okay, so this is hash_expected and hash_location -
1449 * so we need to calculate the checksum.
1451 genhash = tcp_v4_md5_hash_skb(newhash,
1455 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1456 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1457 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1458 &iph->saddr, ntohs(th->source),
1459 &iph->daddr, ntohs(th->dest),
1460 genhash ? " tcp_v4_calc_md5_hash failed"
1469 static void tcp_v4_init_req(struct request_sock *req,
1470 const struct sock *sk_listener,
1471 struct sk_buff *skb)
1473 struct inet_request_sock *ireq = inet_rsk(req);
1474 struct net *net = sock_net(sk_listener);
1476 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1477 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1478 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1481 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1482 struct sk_buff *skb,
1484 struct request_sock *req)
1486 tcp_v4_init_req(req, sk, skb);
1488 if (security_inet_conn_request(sk, skb, req))
1491 return inet_csk_route_req(sk, &fl->u.ip4, req);
1494 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1496 .obj_size = sizeof(struct tcp_request_sock),
1497 .rtx_syn_ack = tcp_rtx_synack,
1498 .send_ack = tcp_v4_reqsk_send_ack,
1499 .destructor = tcp_v4_reqsk_destructor,
1500 .send_reset = tcp_v4_send_reset,
1501 .syn_ack_timeout = tcp_syn_ack_timeout,
1504 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1505 .mss_clamp = TCP_MSS_DEFAULT,
1506 #ifdef CONFIG_TCP_MD5SIG
1507 .req_md5_lookup = tcp_v4_md5_lookup,
1508 .calc_md5_hash = tcp_v4_md5_hash_skb,
1510 #ifdef CONFIG_SYN_COOKIES
1511 .cookie_init_seq = cookie_v4_init_sequence,
1513 .route_req = tcp_v4_route_req,
1514 .init_seq = tcp_v4_init_seq,
1515 .init_ts_off = tcp_v4_init_ts_off,
1516 .send_synack = tcp_v4_send_synack,
1519 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1521 /* Never answer SYNs sent to broadcast or multicast addresses */
1522 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1525 return tcp_conn_request(&tcp_request_sock_ops,
1526 &tcp_request_sock_ipv4_ops, sk, skb);
1532 EXPORT_SYMBOL(tcp_v4_conn_request);
1536 * The three way handshake has completed - we got a valid synack -
1537 * now create the new socket.
1539 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1540 struct request_sock *req,
1541 struct dst_entry *dst,
1542 struct request_sock *req_unhash,
1545 struct inet_request_sock *ireq;
1546 bool found_dup_sk = false;
1547 struct inet_sock *newinet;
1548 struct tcp_sock *newtp;
1550 #ifdef CONFIG_TCP_MD5SIG
1551 const union tcp_md5_addr *addr;
1552 struct tcp_md5sig_key *key;
1555 struct ip_options_rcu *inet_opt;
1557 if (sk_acceptq_is_full(sk))
1560 newsk = tcp_create_openreq_child(sk, req, skb);
1564 newsk->sk_gso_type = SKB_GSO_TCPV4;
1565 inet_sk_rx_dst_set(newsk, skb);
1567 newtp = tcp_sk(newsk);
1568 newinet = inet_sk(newsk);
1569 ireq = inet_rsk(req);
1570 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1571 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1572 newsk->sk_bound_dev_if = ireq->ir_iif;
1573 newinet->inet_saddr = ireq->ir_loc_addr;
1574 inet_opt = rcu_dereference(ireq->ireq_opt);
1575 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1576 newinet->mc_index = inet_iif(skb);
1577 newinet->mc_ttl = ip_hdr(skb)->ttl;
1578 newinet->rcv_tos = ip_hdr(skb)->tos;
1579 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1581 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1582 newinet->inet_id = prandom_u32();
1584 /* Set ToS of the new socket based upon the value of incoming SYN.
1585 * ECT bits are set later in tcp_init_transfer().
1587 if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1588 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1591 dst = inet_csk_route_child_sock(sk, newsk, req);
1595 /* syncookie case: see end of cookie_v4_check() */
1597 sk_setup_caps(newsk, dst);
1599 tcp_ca_openreq_child(newsk, dst);
1601 tcp_sync_mss(newsk, dst_mtu(dst));
1602 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1604 tcp_initialize_rcv_mss(newsk);
1606 #ifdef CONFIG_TCP_MD5SIG
1607 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1608 /* Copy over the MD5 key from the original socket */
1609 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1610 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1613 * We're using one, so create a matching key
1614 * on the newsk structure. If we fail to get
1615 * memory, then we end up not copying the key
1616 * across. Shucks.
1618 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1619 key->key, key->keylen, GFP_ATOMIC);
1620 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1624 if (__inet_inherit_port(sk, newsk) < 0)
1626 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1628 if (likely(*own_req)) {
1629 tcp_move_syn(newtp, req);
1630 ireq->ireq_opt = NULL;
1632 newinet->inet_opt = NULL;
1634 if (!req_unhash && found_dup_sk) {
1635 /* This code path should only be executed in the
1636 * syncookie case.
1638 bh_unlock_sock(newsk);
1646 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1653 newinet->inet_opt = NULL;
1654 inet_csk_prepare_forced_close(newsk);
1658 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
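/* Under SYN-flood, a listener that could not store request state answers
 * with a cookie-encoded sequence number instead; tcp_v4_cookie_check()
 * below validates the cookie echoed in the final ACK and, if it is valid,
 * reconstructs the request socket from it.
 */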
1660 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1662 #ifdef CONFIG_SYN_COOKIES
1663 const struct tcphdr *th = tcp_hdr(skb);
1666 sk = cookie_v4_check(sk, skb);
1671 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1672 struct tcphdr *th, u32 *cookie)
1675 #ifdef CONFIG_SYN_COOKIES
1676 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1677 &tcp_request_sock_ipv4_ops, sk, th);
1679 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1680 tcp_synq_overflow(sk);
1686 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1688 /* The socket must have its spinlock held when we get
1689 * here, unless it is a TCP_LISTEN socket.
1691 * We have a potential double-lock case here, so even when
1692 * doing backlog processing we use the BH locking scheme.
1693 * This is because we cannot sleep with the original spinlock
1696 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1700 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1701 struct dst_entry *dst;
1703 dst = rcu_dereference_protected(sk->sk_rx_dst,
1704 lockdep_sock_is_held(sk));
1706 sock_rps_save_rxhash(sk, skb);
1707 sk_mark_napi_id(sk, skb);
1709 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1710 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1712 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1716 tcp_rcv_established(sk, skb);
1720 if (tcp_checksum_complete(skb))
1723 if (sk->sk_state == TCP_LISTEN) {
1724 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1729 if (tcp_child_process(sk, nsk, skb)) {
1736 sock_rps_save_rxhash(sk, skb);
1738 if (tcp_rcv_state_process(sk, skb)) {
1745 tcp_v4_send_reset(rsk, skb);
1748 /* Be careful here. If this function gets more complicated and
1749 * gcc suffers from register pressure on the x86, sk (in %ebx)
1750 * might be destroyed here. The current version compiles correctly,
1751 * but you have been warned.
1756 trace_tcp_bad_csum(skb);
1757 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1758 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1761 EXPORT_SYMBOL(tcp_v4_do_rcv);
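/* Early demux runs from the IP input path before routing: if the segment
 * matches an established socket we can reuse that socket's cached input
 * route (sk_rx_dst) and skip a full FIB lookup for this packet.
 */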
1763 int tcp_v4_early_demux(struct sk_buff *skb)
1765 const struct iphdr *iph;
1766 const struct tcphdr *th;
1769 if (skb->pkt_type != PACKET_HOST)
1772 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1778 if (th->doff < sizeof(struct tcphdr) / 4)
1781 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1782 iph->saddr, th->source,
1783 iph->daddr, ntohs(th->dest),
1784 skb->skb_iif, inet_sdif(skb));
1787 skb->destructor = sock_edemux;
1788 if (sk_fullsock(sk)) {
1789 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1792 dst = dst_check(dst, 0);
1794 sk->sk_rx_dst_ifindex == skb->skb_iif)
1795 skb_dst_set_noref(skb, dst);
1801 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1803 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1804 u32 tail_gso_size, tail_gso_segs;
1805 struct skb_shared_info *shinfo;
1806 const struct tcphdr *th;
1807 struct tcphdr *thtail;
1808 struct sk_buff *tail;
1809 unsigned int hdrlen;
1815 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1816 * we can fix skb->truesize to its real value to avoid future drops.
1817 * This is valid because skb is not yet charged to the socket.
1818 * It has been noticed that pure SACK packets were sometimes dropped
1819 * (if cooked by drivers without the copybreak feature).
1825 if (unlikely(tcp_checksum_complete(skb))) {
1827 trace_tcp_bad_csum(skb);
1828 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1829 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1833 /* Attempt coalescing to last skb in backlog, even if we are
1834 * above the limits.
1835 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1837 th = (const struct tcphdr *)skb->data;
1838 hdrlen = th->doff * 4;
1840 tail = sk->sk_backlog.tail;
1843 thtail = (struct tcphdr *)tail->data;
1845 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1846 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1847 ((TCP_SKB_CB(tail)->tcp_flags |
1848 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1849 !((TCP_SKB_CB(tail)->tcp_flags &
1850 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1851 ((TCP_SKB_CB(tail)->tcp_flags ^
1852 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1853 #ifdef CONFIG_TLS_DEVICE
1854 tail->decrypted != skb->decrypted ||
1856 thtail->doff != th->doff ||
1857 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1860 __skb_pull(skb, hdrlen);
1862 shinfo = skb_shinfo(skb);
1863 gso_size = shinfo->gso_size ?: skb->len;
1864 gso_segs = shinfo->gso_segs ?: 1;
1866 shinfo = skb_shinfo(tail);
1867 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1868 tail_gso_segs = shinfo->gso_segs ?: 1;
1870 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1871 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1873 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1874 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1875 thtail->window = th->window;
1878 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1879 * thtail->fin, so that the fast path in tcp_rcv_established()
1880 * is not entered if we append a packet with a FIN.
1881 * SYN, RST, URG are not present.
1882 * ACK is set on both packets.
1883 * PSH: we do not really care in the TCP stack,
1884 * at least for 'GRO' packets.
1886 thtail->fin |= th->fin;
1887 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1889 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1890 TCP_SKB_CB(tail)->has_rxtstamp = true;
1891 tail->tstamp = skb->tstamp;
1892 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1895 /* Not as strict as GRO. We only need to carry the max mss value */
1896 shinfo->gso_size = max(gso_size, tail_gso_size);
1897 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1899 sk->sk_backlog.len += delta;
1900 __NET_INC_STATS(sock_net(sk),
1901 LINUX_MIB_TCPBACKLOGCOALESCE);
1902 kfree_skb_partial(skb, fragstolen);
1905 __skb_push(skb, hdrlen);
1908 /* Only the socket owner can try to collapse/prune rx queues
1909 * to reduce memory overhead, so add a little headroom here.
1910 * Few socket backlogs are likely to be non-empty concurrently.
1914 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1916 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1921 EXPORT_SYMBOL(tcp_add_backlog);
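/* Run the socket's attached (BPF) filter over the segment;
 * sk_filter_trim_cap() lets the filter trim payload but never below the
 * TCP header itself (th->doff * 4 bytes).
 */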
1923 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1925 struct tcphdr *th = (struct tcphdr *)skb->data;
1927 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1929 EXPORT_SYMBOL(tcp_filter);
1931 static void tcp_v4_restore_cb(struct sk_buff *skb)
1933 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1934 sizeof(struct inet_skb_parm));
1937 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1938 const struct tcphdr *th)
1940 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1941 * barrier() makes sure the compiler won't play fool^Waliasing games.
1943 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1944 sizeof(struct inet_skb_parm));
1947 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1948 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1949 skb->len - th->doff * 4);
1950 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1951 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1952 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1953 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1954 TCP_SKB_CB(skb)->sacked = 0;
1955 TCP_SKB_CB(skb)->has_rxtstamp =
1956 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1963 int tcp_v4_rcv(struct sk_buff *skb)
1965 struct net *net = dev_net(skb->dev);
1966 struct sk_buff *skb_to_free;
1967 int sdif = inet_sdif(skb);
1968 int dif = inet_iif(skb);
1969 const struct iphdr *iph;
1970 const struct tcphdr *th;
1975 if (skb->pkt_type != PACKET_HOST)
1978 /* Count it even if it's bad */
1979 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1981 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1984 th = (const struct tcphdr *)skb->data;
1986 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1988 if (!pskb_may_pull(skb, th->doff * 4))
1991 /* An explanation is required here, I think.
1992 * Packet length and doff are validated by header prediction,
1993 * provided the case of th->doff==0 is eliminated.
1994 * So, we defer the checks. */
1996 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1999 th = (const struct tcphdr *)skb->data;
2002 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
2003 th->dest, sdif, &refcounted);
2008 if (sk->sk_state == TCP_TIME_WAIT)
2011 if (sk->sk_state == TCP_NEW_SYN_RECV) {
2012 struct request_sock *req = inet_reqsk(sk);
2013 bool req_stolen = false;
2016 sk = req->rsk_listener;
2017 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
2018 sk_drops_add(sk, skb);
2022 if (tcp_checksum_complete(skb)) {
2026 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2027 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2029 inet_csk_reqsk_queue_drop_and_put(sk, req);
2033 /* reuseport_migrate_sock() has already held one sk_refcnt
2034 * before returning.
2037 /* We own a reference on the listener, increase it again
2038 * as we might lose it too soon.
2044 if (!tcp_filter(sk, skb)) {
2045 th = (const struct tcphdr *)skb->data;
2047 tcp_v4_fill_cb(skb, iph, th);
2048 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2053 /* Another cpu got exclusive access to req
2054 * and created a full blown socket.
2055 * Try to feed this packet to this socket
2056 * instead of discarding it.
2058 tcp_v4_restore_cb(skb);
2062 goto discard_and_relse;
2066 tcp_v4_restore_cb(skb);
2067 } else if (tcp_child_process(sk, nsk, skb)) {
2068 tcp_v4_send_reset(nsk, skb);
2069 goto discard_and_relse;
2075 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2076 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2077 goto discard_and_relse;
2080 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2081 goto discard_and_relse;
2083 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2084 goto discard_and_relse;
2088 if (tcp_filter(sk, skb))
2089 goto discard_and_relse;
2090 th = (const struct tcphdr *)skb->data;
2092 tcp_v4_fill_cb(skb, iph, th);
2096 if (sk->sk_state == TCP_LISTEN) {
2097 ret = tcp_v4_do_rcv(sk, skb);
2098 goto put_and_return;
2101 sk_incoming_cpu_update(sk);
2103 bh_lock_sock_nested(sk);
2104 tcp_segs_in(tcp_sk(sk), skb);
2106 if (!sock_owned_by_user(sk)) {
2107 skb_to_free = sk->sk_rx_skb_cache;
2108 sk->sk_rx_skb_cache = NULL;
2109 ret = tcp_v4_do_rcv(sk, skb);
2111 if (tcp_add_backlog(sk, skb))
2112 goto discard_and_relse;
2117 __kfree_skb(skb_to_free);
2126 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2129 tcp_v4_fill_cb(skb, iph, th);
2131 if (tcp_checksum_complete(skb)) {
2133 trace_tcp_bad_csum(skb);
2134 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2136 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2138 tcp_v4_send_reset(NULL, skb);
2142 /* Discard frame. */
2147 sk_drops_add(sk, skb);
2153 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2154 inet_twsk_put(inet_twsk(sk));
2158 tcp_v4_fill_cb(skb, iph, th);
2160 if (tcp_checksum_complete(skb)) {
2161 inet_twsk_put(inet_twsk(sk));
2164 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2166 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2169 iph->saddr, th->source,
2170 iph->daddr, th->dest,
2174 inet_twsk_deschedule_put(inet_twsk(sk));
2176 tcp_v4_restore_cb(skb);
2184 tcp_v4_timewait_ack(sk, skb);
2187 tcp_v4_send_reset(sk, skb);
2188 inet_twsk_deschedule_put(inet_twsk(sk));
2190 case TCP_TW_SUCCESS:;
2195 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2196 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2197 .twsk_unique = tcp_twsk_unique,
2198 .twsk_destructor= tcp_twsk_destructor,
2201 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2203 struct dst_entry *dst = skb_dst(skb);
2205 if (dst && dst_hold_safe(dst)) {
2206 rcu_assign_pointer(sk->sk_rx_dst, dst);
2207 sk->sk_rx_dst_ifindex = skb->skb_iif;
2210 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2212 const struct inet_connection_sock_af_ops ipv4_specific = {
2213 .queue_xmit = ip_queue_xmit,
2214 .send_check = tcp_v4_send_check,
2215 .rebuild_header = inet_sk_rebuild_header,
2216 .sk_rx_dst_set = inet_sk_rx_dst_set,
2217 .conn_request = tcp_v4_conn_request,
2218 .syn_recv_sock = tcp_v4_syn_recv_sock,
2219 .net_header_len = sizeof(struct iphdr),
2220 .setsockopt = ip_setsockopt,
2221 .getsockopt = ip_getsockopt,
2222 .addr2sockaddr = inet_csk_addr2sockaddr,
2223 .sockaddr_len = sizeof(struct sockaddr_in),
2224 .mtu_reduced = tcp_v4_mtu_reduced,
2226 EXPORT_SYMBOL(ipv4_specific);
2228 #ifdef CONFIG_TCP_MD5SIG
2229 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2230 .md5_lookup = tcp_v4_md5_lookup,
2231 .calc_md5_hash = tcp_v4_md5_hash_skb,
2232 .md5_parse = tcp_v4_parse_md5_keys,
2236 /* NOTE: A lot of things are set to zero explicitly by the call to
2237 * sk_alloc(), so they need not be done here.
2239 static int tcp_v4_init_sock(struct sock *sk)
2241 struct inet_connection_sock *icsk = inet_csk(sk);
2245 icsk->icsk_af_ops = &ipv4_specific;
2247 #ifdef CONFIG_TCP_MD5SIG
2248 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2254 void tcp_v4_destroy_sock(struct sock *sk)
2256 struct tcp_sock *tp = tcp_sk(sk);
2258 trace_tcp_destroy_sock(sk);
2260 tcp_clear_xmit_timers(sk);
2262 tcp_cleanup_congestion_control(sk);
2264 tcp_cleanup_ulp(sk);
2266 /* Clean up the write buffer. */
2267 tcp_write_queue_purge(sk);
2269 /* Check if we want to disable active TFO */
2270 tcp_fastopen_active_disable_ofo_check(sk);
2272 /* Cleans up our, hopefully empty, out_of_order_queue. */
2273 skb_rbtree_purge(&tp->out_of_order_queue);
2275 #ifdef CONFIG_TCP_MD5SIG
2276 /* Clean up the MD5 key list, if any */
2277 if (tp->md5sig_info) {
2278 tcp_clear_md5_list(sk);
2279 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2280 tp->md5sig_info = NULL;
2284 /* Clean up a referenced TCP bind bucket. */
2285 if (inet_csk(sk)->icsk_bind_hash)
2288 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2290 /* If the socket is aborted during the connect operation */
2291 tcp_free_fastopen_req(tp);
2292 tcp_fastopen_destroy_cipher(sk);
2293 tcp_saved_syn_free(tp);
2295 sk_sockets_allocated_dec(sk);
2297 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2299 #ifdef CONFIG_PROC_FS
2300 /* Proc filesystem TCP sock list dumping. */
2302 static unsigned short seq_file_family(const struct seq_file *seq);
2304 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2306 unsigned short family = seq_file_family(seq);
2308 /* AF_UNSPEC is used as a match-all */
2309 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2310 net_eq(sock_net(sk), seq_file_net(seq)));
2313 /* Find a non-empty bucket (starting from st->bucket)
2314 * and return the first sk from it.
2316 static void *listening_get_first(struct seq_file *seq)
2318 struct tcp_iter_state *st = seq->private;
2321 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2322 struct inet_listen_hashbucket *ilb2;
2323 struct inet_connection_sock *icsk;
2326 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2327 if (hlist_empty(&ilb2->head))
2330 spin_lock(&ilb2->lock);
2331 inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
2332 sk = (struct sock *)icsk;
2333 if (seq_sk_match(seq, sk))
2336 spin_unlock(&ilb2->lock);
2342 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2343 * If "cur" is the last one in the st->bucket,
2344 * call listening_get_first() to return the first sk of the next
2345 * non-empty bucket.
2347 static void *listening_get_next(struct seq_file *seq, void *cur)
2349 struct tcp_iter_state *st = seq->private;
2350 struct inet_listen_hashbucket *ilb2;
2351 struct inet_connection_sock *icsk;
2352 struct sock *sk = cur;
2357 icsk = inet_csk(sk);
2358 inet_lhash2_for_each_icsk_continue(icsk) {
2359 sk = (struct sock *)icsk;
2360 if (seq_sk_match(seq, sk))
2364 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2365 spin_unlock(&ilb2->lock);
2367 return listening_get_first(seq);
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_first(seq);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}
static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}
/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (seq_sk_match(seq, sk))
				return sk;
		}
		spin_unlock_bh(lock);
	}

	return NULL;
}
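
/* Same locking scheme as the listening hash: a successful lookup
 * returns with the bucket lock held, to be released by
 * established_get_next() or tcp_seq_stop().
 */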
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
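
/* The combined iteration order is thus all matching listening sockets
 * first, then everything in the established hash (which also holds
 * TIME_WAIT entries); "pos" indexes into that combined sequence.
 */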
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket > tcp_hashinfo.lhash2_mask)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
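
/* The offset replay above is best effort: if sockets were added to or
 * removed from the bucket between reads, a restarted dump may show a
 * few entries twice or miss ones that moved, which is the usual
 * trade-off for not pinning the table across syscalls.
 */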
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);
void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}
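
/* A typical /proc/net/tcp line (illustrative values only, not taken
 * from a live system) looks like:
 *
 *   0: 0100007F:0CEA 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000 0 18083 1 0000000000000000 100 0 0 10 0
 *
 * i.e. a socket bound to 127.0.0.1:3306 (0x0CEA), unconnected, in
 * state 0x0A (TCP_LISTEN), owned by uid 1000.
 */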
#ifdef CONFIG_BPF_SYSCALL
struct bpf_tcp_iter_state {
	struct tcp_iter_state state;
	unsigned int cur_sk;
	unsigned int end_sk;
	unsigned int max_sk;
	struct sock **batch;
	bool st_bucket_done;
};
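
/* The batch array exists so that show() never runs under a bucket
 * lock: all sockets of one bucket are grabbed (each with a reference)
 * in a single pass under the lock, which is then dropped before the
 * bpf program is invoked on each batched socket.
 */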
struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct sock_common *, sk_common);
	uid_t uid __aligned(8);
};
static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			     struct sock_common *sk_common, uid_t uid)
{
	struct bpf_iter__tcp ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.sk_common = sk_common;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}
static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
{
	while (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);
}
static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
				      unsigned int new_batch_sz)
{
	struct sock **new_batch;

	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
			     GFP_USER | __GFP_NOWARN);
	if (!new_batch)
		return -ENOMEM;

	bpf_iter_tcp_put_batch(iter);
	kvfree(iter->batch);
	iter->batch = new_batch;
	iter->max_sk = new_batch_sz;

	return 0;
}
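
/* kvmalloc() may fall back to vmalloc() for large buckets, and
 * __GFP_NOWARN keeps a failed resize quiet; the caller then simply
 * proceeds with a partially batched bucket.
 */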
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
						 struct sock *start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct inet_connection_sock *icsk;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	icsk = inet_csk(start_sk);
	inet_lhash2_for_each_icsk_continue(icsk) {
		sk = (struct sock *)icsk;
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++] = sk;
			}
			expected++;
		}
	}
	spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);

	return expected;
}
static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
						   struct sock *start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	sk = sk_nulls_next(start_sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++] = sk;
			}
			expected++;
		}
	}
	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));

	return expected;
}
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	/* The st->bucket is done.  Directly advance to the next
	 * bucket instead of letting tcp_seek_last_pos() skip entries
	 * one by one in the current bucket before discovering that it
	 * has to advance to the next bucket.
	 */
	if (iter->st_bucket_done) {
		st->offset = 0;
		st->bucket++;
		if (st->state == TCP_SEQ_STATE_LISTENING &&
		    st->bucket > tcp_hashinfo.lhash2_mask) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
		}
	}

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;
	iter->st_bucket_done = false;

	sk = tcp_seek_last_pos(seq);
	if (!sk)
		return NULL; /* Done */

	if (st->state == TCP_SEQ_STATE_LISTENING)
		expected = bpf_iter_tcp_listening_batch(seq, sk);
	else
		expected = bpf_iter_tcp_established_batch(seq, sk);

	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}
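
/* If the batch still cannot hold the whole bucket (or the resize
 * failed), the partial batch is used and st_bucket_done stays false,
 * so the next call resumes inside the same bucket via st->offset.
 */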
static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	/* bpf iter does not support lseek, so it always
	 * continues from where it was stop()-ped.
	 */
	if (*pos)
		return bpf_iter_tcp_batch(seq);

	return SEQ_START_TOKEN;
}
static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk) {
		/* Keeping st->num consistent in tcp_iter_state.
		 * bpf_iter_tcp does not use st->num.
		 * meta.seq_num is used instead.
		 */
		st->num++;
		/* Move st->offset to the next sk in the bucket such that
		 * the future start() will resume at st->offset in
		 * st->bucket.  See tcp_seek_last_pos().
		 */
		st->offset++;
		sock_put(iter->batch[iter->cur_sk++]);
	}

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk];
	else
		sk = bpf_iter_tcp_batch(seq);

	++*pos;
	/* Keeping st->last_pos consistent in tcp_iter_state.
	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
	 */
	st->last_pos = *pos;

	return sk;
}
static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	bool slow;
	uid_t uid;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	if (sk_fullsock(sk))
		slow = lock_sock_fast(sk);

	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	if (sk->sk_state == TCP_TIME_WAIT) {
		uid = 0;
	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
		const struct request_sock *req = v;

		uid = from_kuid_munged(seq_user_ns(seq),
				       sock_i_uid(req->rsk_listener));
	} else {
		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	}

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = tcp_prog_seq_show(prog, &meta, v, uid);

unlock:
	if (sk_fullsock(sk))
		unlock_sock_fast(sk, slow);
	return ret;
}
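
/* Only full sockets are locked above: TCP_NEW_SYN_RECV and
 * TCP_TIME_WAIT entries are mini sockets without a socket lock,
 * hence the sk_fullsock() checks.
 */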
static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)tcp_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk) {
		bpf_iter_tcp_put_batch(iter);
		iter->st_bucket_done = false;
	}
}
static const struct seq_operations bpf_iter_tcp_seq_ops = {
	.show		= bpf_iter_tcp_seq_show,
	.start		= bpf_iter_tcp_seq_start,
	.next		= bpf_iter_tcp_seq_next,
	.stop		= bpf_iter_tcp_seq_stop,
};
#endif
static unsigned short seq_file_family(const struct seq_file *seq)
{
	const struct tcp_seq_afinfo *afinfo;

#ifdef CONFIG_BPF_SYSCALL
	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
	if (seq->op == &bpf_iter_tcp_seq_ops)
		return AF_UNSPEC;
#endif

	/* Iterated from proc fs */
	afinfo = PDE_DATA(file_inode(seq->file));
	return afinfo->family;
}
static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};
static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}
static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
/* @wake is one when sk_stream_write_space() calls us.
 * This sends EPOLLOUT only if notsent_bytes is below half the limit.
 * This mimics the strategy used in sock_def_write_space().
 */
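/* Worked example (illustrative numbers): with tcp_notsent_lowat set
 * to 128KB, the plain check (wake == 0) passes while less than 128KB
 * is unsent, but the wake-up path (wake == 1) doubles notsent_bytes
 * via the shift below, so EPOLLOUT is signalled only once the unsent
 * backlog drops below 64KB.
 */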
bool tcp_stream_memory_free(const struct sock *sk, int wake)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
			    READ_ONCE(tp->snd_nxt);

	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
}
EXPORT_SYMBOL(tcp_stream_memory_free);
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= tcp_bpf_update_proto,
#endif
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}
static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}
static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)

#define INIT_BATCH_SZ 16
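
/* The batch starts small; bpf_iter_tcp_batch() regrows it on demand
 * when a bucket turns out to hold more than INIT_BATCH_SZ sockets.
 */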
static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_tcp_iter_state *iter = priv_data;
	int err;

	err = bpf_iter_init_seq_net(priv_data, aux);
	if (err)
		return err;

	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
	if (err) {
		bpf_iter_fini_seq_net(priv_data);
		return err;
	}

	return 0;
}
static void bpf_iter_fini_tcp(void *priv_data)
{
	struct bpf_tcp_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}
static const struct bpf_iter_seq_info tcp_seq_info = {
	.seq_ops		= &bpf_iter_tcp_seq_ops,
	.init_seq_private	= bpf_iter_init_tcp,
	.fini_seq_private	= bpf_iter_fini_tcp,
	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
};
static const struct bpf_func_proto *
bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
			    const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_setsockopt:
		return &bpf_sk_setsockopt_proto;
	case BPF_FUNC_getsockopt:
		return &bpf_sk_getsockopt_proto;
	default:
		return NULL;
	}
}
static struct bpf_iter_reg tcp_reg_info = {
	.target			= "tcp",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.get_func_proto		= bpf_iter_tcp_get_func_proto,
	.seq_info		= &tcp_seq_info,
};
static void __init bpf_iter_register(void)
{
	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
	if (bpf_iter_reg_target(&tcp_reg_info))
		pr_warn("Warning: could not register bpf iterator tcp\n");
}
#endif
void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif
}