net/core/sock.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              Generic socket support routines. Memory allocators, socket lock/release
   8  *              handler for protocols to use and generic option handler.
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  */
  85
  86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88 #include <asm/unaligned.h>
  89 #include <linux/capability.h>
  90 #include <linux/errno.h>
  91 #include <linux/errqueue.h>
  92 #include <linux/types.h>
  93 #include <linux/socket.h>
  94 #include <linux/in.h>
  95 #include <linux/kernel.h>
  96 #include <linux/module.h>
  97 #include <linux/proc_fs.h>
  98 #include <linux/seq_file.h>
  99 #include <linux/sched.h>
 100 #include <linux/sched/mm.h>
 101 #include <linux/timer.h>
 102 #include <linux/string.h>
 103 #include <linux/sockios.h>
 104 #include <linux/net.h>
 105 #include <linux/mm.h>
 106 #include <linux/slab.h>
 107 #include <linux/interrupt.h>
 108 #include <linux/poll.h>
 109 #include <linux/tcp.h>
 110 #include <linux/udp.h>
 111 #include <linux/init.h>
 112 #include <linux/highmem.h>
 113 #include <linux/user_namespace.h>
 114 #include <linux/static_key.h>
 115 #include <linux/memcontrol.h>
 116 #include <linux/prefetch.h>
 117 #include <linux/compat.h>
 118 #include <linux/mroute.h>
 119 #include <linux/mroute6.h>
 120 #include <linux/icmpv6.h>
 121
 122 #include <linux/uaccess.h>
 123
 124 #include <linux/netdevice.h>
 125 #include <net/protocol.h>
 126 #include <linux/skbuff.h>
 127 #include <net/net_namespace.h>
 128 #include <net/request_sock.h>
 129 #include <net/sock.h>
 130 #include <linux/net_tstamp.h>
 131 #include <net/xfrm.h>
 132 #include <linux/ipsec.h>
 133 #include <net/cls_cgroup.h>
 134 #include <net/netprio_cgroup.h>
 135 #include <linux/sock_diag.h>
 136
 137 #include <linux/filter.h>
 138 #include <net/sock_reuseport.h>
 139 #include <net/bpf_sk_storage.h>
 140
 141 #include <trace/events/sock.h>
 142
 143 #include <net/tcp.h>
 144 #include <net/busy_poll.h>
 145 #include <net/phonet/phonet.h>
 146
 147 #include <linux/ethtool.h>
 148
 149 #include "dev.h"
 150
 151 static DEFINE_MUTEX(proto_list_mutex);
 152 static LIST_HEAD(proto_list);
 153
 154 static void sock_def_write_space_wfree(struct sock *sk);
 155 static void sock_def_write_space(struct sock *sk);
 156
 157 /**
 158  * sk_ns_capable - General socket capability test
 159  * @sk: Socket to use a capability on or through
 160  * @user_ns: The user namespace of the capability to use
 161  * @cap: The capability to use
 162  *
 163  * Test to see if the opener of the socket had when the socket was
 164  * created and the current process has the capability @cap in the user
 165  * namespace @user_ns.
 166  */
 167 bool sk_ns_capable(const struct sock *sk,
 168                    struct user_namespace *user_ns, int cap)
 169 {
 170         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 171                 ns_capable(user_ns, cap);
 172 }
 173 EXPORT_SYMBOL(sk_ns_capable);
 174
 175 /**
 176  * sk_capable - Socket global capability test
 177  * @sk: Socket to use a capability on or through
 178  * @cap: The global capability to use
 179  *
 180  * Test to see if the opener of the socket had when the socket was
 181  * created and the current process has the capability @cap in all user
 182  * namespaces.
 183  */
 184 bool sk_capable(const struct sock *sk, int cap)
 185 {
 186         return sk_ns_capable(sk, &init_user_ns, cap);
 187 }
 188 EXPORT_SYMBOL(sk_capable);
 189
 190 /**
 191  * sk_net_capable - Network namespace socket capability test
 192  * @sk: Socket to use a capability on or through
 193  * @cap: The capability to use
 194  *
 195  * Test to see if the opener of the socket had when the socket was created
 196  * and the current process has the capability @cap over the network namespace
 197  * the socket is a member of.
 198  */
 199 bool sk_net_capable(const struct sock *sk, int cap)
 200 {
 201         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 202 }
 203 EXPORT_SYMBOL(sk_net_capable);
 204
 205 /*
 206  * Each address family might have different locking rules, so we have
 207  * one slock key per address family and separate keys for internal and
 208  * userspace sockets.
 209  */
 210 static struct lock_class_key af_family_keys[AF_MAX];
 211 static struct lock_class_key af_family_kern_keys[AF_MAX];
 212 static struct lock_class_key af_family_slock_keys[AF_MAX];
 213 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 214
 215 /*
 216  * Make lock validator output more readable. (we pre-construct these
 217  * strings build-time, so that runtime initialization of socket
 218  * locks is fast):
 219  */
 220
 221 #define _sock_locks(x)                                            \
 222   x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 223   x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 224   x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 225   x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 226   x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 227   x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 228   x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 229   x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 230   x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 231   x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 232   x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 233   x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 234   x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 235   x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 236   x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
 237   x "AF_MCTP"  , \
 238   x "AF_MAX"
 239
 240 static const char *const af_family_key_strings[AF_MAX+1] = {
 241         _sock_locks("sk_lock-")
 242 };
 243 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 244         _sock_locks("slock-")
 245 };
 246 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 247         _sock_locks("clock-")
 248 };
 249
 250 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 251         _sock_locks("k-sk_lock-")
 252 };
 253 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 254         _sock_locks("k-slock-")
 255 };
 256 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 257         _sock_locks("k-clock-")
 258 };
 259 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 260         _sock_locks("rlock-")
 261 };
 262 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 263         _sock_locks("wlock-")
 264 };
 265 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 266         _sock_locks("elock-")
 267 };
 268
 269 /*
 270  * sk_callback_lock and sk queues locking rules are per-address-family,
 271  * so split the lock classes by using a per-AF key:
 272  */
 273 static struct lock_class_key af_callback_keys[AF_MAX];
 274 static struct lock_class_key af_rlock_keys[AF_MAX];
 275 static struct lock_class_key af_wlock_keys[AF_MAX];
 276 static struct lock_class_key af_elock_keys[AF_MAX];
 277 static struct lock_class_key af_kern_callback_keys[AF_MAX];
 278
 279 /* Run time adjustable parameters. */
 280 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 281 EXPORT_SYMBOL(sysctl_wmem_max);
 282 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 283 EXPORT_SYMBOL(sysctl_rmem_max);
 284 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 285 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 286
 287 /* Maximal space eaten by iovec or ancillary data plus some space */
 288 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 289 EXPORT_SYMBOL(sysctl_optmem_max);
 290
 291 int sysctl_tstamp_allow_data __read_mostly = 1;
 292
 293 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 294 EXPORT_SYMBOL_GPL(memalloc_socks_key);
 295
 296 /**
 297  * sk_set_memalloc - sets %SOCK_MEMALLOC
 298  * @sk: socket to set it on
 299  *
 300  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 301  * It's the responsibility of the admin to adjust min_free_kbytes
 302  * to meet the requirements
 303  */
 304 void sk_set_memalloc(struct sock *sk)
 305 {
 306         sock_set_flag(sk, SOCK_MEMALLOC);
 307         sk->sk_allocation |= __GFP_MEMALLOC;
 308         static_branch_inc(&memalloc_socks_key);
 309 }
 310 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 311
 312 void sk_clear_memalloc(struct sock *sk)
 313 {
 314         sock_reset_flag(sk, SOCK_MEMALLOC);
 315         sk->sk_allocation &= ~__GFP_MEMALLOC;
 316         static_branch_dec(&memalloc_socks_key);
 317
 318         /*
 319          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 320          * progress of swapping. SOCK_MEMALLOC may be cleared while
 321          * it has rmem allocations due to the last swapfile being deactivated
 322          * but there is a risk that the socket is unusable due to exceeding
 323          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 324          */
 325         sk_mem_reclaim(sk);
 326 }
 327 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 328
 329 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 330 {
 331         int ret;
 332         unsigned int noreclaim_flag;
 333
 334         /* these should have been dropped before queueing */
 335         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 336
 337         noreclaim_flag = memalloc_noreclaim_save();
 338         ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
 339                                  tcp_v6_do_rcv,
 340                                  tcp_v4_do_rcv,
 341                                  sk, skb);
 342         memalloc_noreclaim_restore(noreclaim_flag);
 343
 344         return ret;
 345 }
 346 EXPORT_SYMBOL(__sk_backlog_rcv);
 347
 348 void sk_error_report(struct sock *sk)
 349 {
 350         sk->sk_error_report(sk);
 351
 352         switch (sk->sk_family) {
 353         case AF_INET:
 354                 fallthrough;
 355         case AF_INET6:
 356                 trace_inet_sk_error_report(sk);
 357                 break;
 358         default:
 359                 break;
 360         }
 361 }
 362 EXPORT_SYMBOL(sk_error_report);
 363
 364 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 365 {
 366         struct __kernel_sock_timeval tv;
 367
 368         if (timeo == MAX_SCHEDULE_TIMEOUT) {
 369                 tv.tv_sec = 0;
 370                 tv.tv_usec = 0;
 371         } else {
 372                 tv.tv_sec = timeo / HZ;
 373                 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 374         }
 375
 376         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 377                 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 378                 *(struct old_timeval32 *)optval = tv32;
 379                 return sizeof(tv32);
 380         }
 381
 382         if (old_timeval) {
 383                 struct __kernel_old_timeval old_tv;
 384                 old_tv.tv_sec = tv.tv_sec;
 385                 old_tv.tv_usec = tv.tv_usec;
 386                 *(struct __kernel_old_timeval *)optval = old_tv;
 387                 return sizeof(old_tv);
 388         }
 389
 390         *(struct __kernel_sock_timeval *)optval = tv;
 391         return sizeof(tv);
 392 }
 393 EXPORT_SYMBOL(sock_get_timeout);
 394
 395 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
 396                            sockptr_t optval, int optlen, bool old_timeval)
 397 {
 398         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 399                 struct old_timeval32 tv32;
 400
 401                 if (optlen < sizeof(tv32))
 402                         return -EINVAL;
 403
 404                 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
 405                         return -EFAULT;
 406                 tv->tv_sec = tv32.tv_sec;
 407                 tv->tv_usec = tv32.tv_usec;
 408         } else if (old_timeval) {
 409                 struct __kernel_old_timeval old_tv;
 410
 411                 if (optlen < sizeof(old_tv))
 412                         return -EINVAL;
 413                 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
 414                         return -EFAULT;
 415                 tv->tv_sec = old_tv.tv_sec;
 416                 tv->tv_usec = old_tv.tv_usec;
 417         } else {
 418                 if (optlen < sizeof(*tv))
 419                         return -EINVAL;
 420                 if (copy_from_sockptr(tv, optval, sizeof(*tv)))
 421                         return -EFAULT;
 422         }
 423
 424         return 0;
 425 }
 426 EXPORT_SYMBOL(sock_copy_user_timeval);
 427
 428 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
 429                             bool old_timeval)
 430 {
 431         struct __kernel_sock_timeval tv;
 432         int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
 433         long val;
 434
 435         if (err)
 436                 return err;
 437
 438         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 439                 return -EDOM;
 440
 441         if (tv.tv_sec < 0) {
 442                 static int warned __read_mostly;
 443
 444                 WRITE_ONCE(*timeo_p, 0);
 445                 if (warned < 10 && net_ratelimit()) {
 446                         warned++;
 447                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 448                                 __func__, current->comm, task_pid_nr(current));
 449                 }
 450                 return 0;
 451         }
 452         val = MAX_SCHEDULE_TIMEOUT;
 453         if ((tv.tv_sec || tv.tv_usec) &&
 454             (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
 455                 val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
 456                                                     USEC_PER_SEC / HZ);
 457         WRITE_ONCE(*timeo_p, val);
 458         return 0;
 459 }
 460
 461 static bool sock_needs_netstamp(const struct sock *sk)
 462 {
 463         switch (sk->sk_family) {
 464         case AF_UNSPEC:
 465         case AF_UNIX:
 466                 return false;
 467         default:
 468                 return true;
 469         }
 470 }
 471
 472 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 473 {
 474         if (sk->sk_flags & flags) {
 475                 sk->sk_flags &= ~flags;
 476                 if (sock_needs_netstamp(sk) &&
 477                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 478                         net_disable_timestamp();
 479         }
 480 }
 481
 482
 483 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 484 {
 485         unsigned long flags;
 486         struct sk_buff_head *list = &sk->sk_receive_queue;
 487
 488         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 489                 atomic_inc(&sk->sk_drops);
 490                 trace_sock_rcvqueue_full(sk, skb);
 491                 return -ENOMEM;
 492         }
 493
 494         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 495                 atomic_inc(&sk->sk_drops);
 496                 return -ENOBUFS;
 497         }
 498
 499         skb->dev = NULL;
 500         skb_set_owner_r(skb, sk);
 501
 502         /* we escape from rcu protected region, make sure we dont leak
 503          * a norefcounted dst
 504          */
 505         skb_dst_force(skb);
 506
 507         spin_lock_irqsave(&list->lock, flags);
 508         sock_skb_set_dropcount(sk, skb);
 509         __skb_queue_tail(list, skb);
 510         spin_unlock_irqrestore(&list->lock, flags);
 511
 512         if (!sock_flag(sk, SOCK_DEAD))
 513                 sk->sk_data_ready(sk);
 514         return 0;
 515 }
 516 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 517
 518 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
 519                               enum skb_drop_reason *reason)
 520 {
 521         enum skb_drop_reason drop_reason;
 522         int err;
 523
 524         err = sk_filter(sk, skb);
 525         if (err) {
 526                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
 527                 goto out;
 528         }
 529         err = __sock_queue_rcv_skb(sk, skb);
 530         switch (err) {
 531         case -ENOMEM:
 532                 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
 533                 break;
 534         case -ENOBUFS:
 535                 drop_reason = SKB_DROP_REASON_PROTO_MEM;
 536                 break;
 537         default:
 538                 drop_reason = SKB_NOT_DROPPED_YET;
 539                 break;
 540         }
 541 out:
 542         if (reason)
 543                 *reason = drop_reason;
 544         return err;
 545 }
 546 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
 547
 548 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 549                      const int nested, unsigned int trim_cap, bool refcounted)
 550 {
 551         int rc = NET_RX_SUCCESS;
 552
 553         if (sk_filter_trim_cap(sk, skb, trim_cap))
 554                 goto discard_and_relse;
 555
 556         skb->dev = NULL;
 557
 558         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 559                 atomic_inc(&sk->sk_drops);
 560                 goto discard_and_relse;
 561         }
 562         if (nested)
 563                 bh_lock_sock_nested(sk);
 564         else
 565                 bh_lock_sock(sk);
 566         if (!sock_owned_by_user(sk)) {
 567                 /*
 568                  * trylock + unlock semantics:
 569                  */
 570                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 571
 572                 rc = sk_backlog_rcv(sk, skb);
 573
 574                 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 575         } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
 576                 bh_unlock_sock(sk);
 577                 atomic_inc(&sk->sk_drops);
 578                 goto discard_and_relse;
 579         }
 580
 581         bh_unlock_sock(sk);
 582 out:
 583         if (refcounted)
 584                 sock_put(sk);
 585         return rc;
 586 discard_and_relse:
 587         kfree_skb(skb);
 588         goto out;
 589 }
 590 EXPORT_SYMBOL(__sk_receive_skb);
 591
 592 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
 593                                                           u32));
 594 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 595                                                            u32));
 596 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 597 {
 598         struct dst_entry *dst = __sk_dst_get(sk);
 599
 600         if (dst && dst->obsolete &&
 601             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 602                                dst, cookie) == NULL) {
 603                 sk_tx_queue_clear(sk);
 604                 WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
 605                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 606                 dst_release(dst);
 607                 return NULL;
 608         }
 609
 610         return dst;
 611 }
 612 EXPORT_SYMBOL(__sk_dst_check);
 613
 614 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 615 {
 616         struct dst_entry *dst = sk_dst_get(sk);
 617
 618         if (dst && dst->obsolete &&
 619             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 620                                dst, cookie) == NULL) {
 621                 sk_dst_reset(sk);
 622                 dst_release(dst);
 623                 return NULL;
 624         }
 625
 626         return dst;
 627 }
 628 EXPORT_SYMBOL(sk_dst_check);
 629
 630 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
 631 {
 632         int ret = -ENOPROTOOPT;
 633 #ifdef CONFIG_NETDEVICES
 634         struct net *net = sock_net(sk);
 635
 636         /* Sorry... */
 637         ret = -EPERM;
 638         if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
 639                 goto out;
 640
 641         ret = -EINVAL;
 642         if (ifindex < 0)
 643                 goto out;
 644
 645         /* Paired with all READ_ONCE() done locklessly. */
 646         WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
 647
 648         if (sk->sk_prot->rehash)
 649                 sk->sk_prot->rehash(sk);
 650         sk_dst_reset(sk);
 651
 652         ret = 0;
 653
 654 out:
 655 #endif
 656
 657         return ret;
 658 }
 659
 660 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
 661 {
 662         int ret;
 663
 664         if (lock_sk)
 665                 lock_sock(sk);
 666         ret = sock_bindtoindex_locked(sk, ifindex);
 667         if (lock_sk)
 668                 release_sock(sk);
 669
 670         return ret;
 671 }
 672 EXPORT_SYMBOL(sock_bindtoindex);
 673
 674 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
 675 {
 676         int ret = -ENOPROTOOPT;
 677 #ifdef CONFIG_NETDEVICES
 678         struct net *net = sock_net(sk);
 679         char devname[IFNAMSIZ];
 680         int index;
 681
 682         ret = -EINVAL;
 683         if (optlen < 0)
 684                 goto out;
 685
 686         /* Bind this socket to a particular device like "eth0",
 687          * as specified in the passed interface name. If the
 688          * name is "" or the option length is zero the socket
 689          * is not bound.
 690          */
 691         if (optlen > IFNAMSIZ - 1)
 692                 optlen = IFNAMSIZ - 1;
 693         memset(devname, 0, sizeof(devname));
 694
 695         ret = -EFAULT;
 696         if (copy_from_sockptr(devname, optval, optlen))
 697                 goto out;
 698
 699         index = 0;
 700         if (devname[0] != '\0') {
 701                 struct net_device *dev;
 702
 703                 rcu_read_lock();
 704                 dev = dev_get_by_name_rcu(net, devname);
 705                 if (dev)
 706                         index = dev->ifindex;
 707                 rcu_read_unlock();
 708                 ret = -ENODEV;
 709                 if (!dev)
 710                         goto out;
 711         }
 712
 713         sockopt_lock_sock(sk);
 714         ret = sock_bindtoindex_locked(sk, index);
 715         sockopt_release_sock(sk);
 716 out:
 717 #endif
 718
 719         return ret;
 720 }
 721
 722 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
 723                                 sockptr_t optlen, int len)
 724 {
 725         int ret = -ENOPROTOOPT;
 726 #ifdef CONFIG_NETDEVICES
 727         int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
 728         struct net *net = sock_net(sk);
 729         char devname[IFNAMSIZ];
 730
 731         if (bound_dev_if == 0) {
 732                 len = 0;
 733                 goto zero;
 734         }
 735
 736         ret = -EINVAL;
 737         if (len < IFNAMSIZ)
 738                 goto out;
 739
 740         ret = netdev_get_name(net, devname, bound_dev_if);
 741         if (ret)
 742                 goto out;
 743
 744         len = strlen(devname) + 1;
 745
 746         ret = -EFAULT;
 747         if (copy_to_sockptr(optval, devname, len))
 748                 goto out;
 749
 750 zero:
 751         ret = -EFAULT;
 752         if (copy_to_sockptr(optlen, &len, sizeof(int)))
 753                 goto out;
 754
 755         ret = 0;
 756
 757 out:
 758 #endif
 759
 760         return ret;
 761 }
 762
 763 bool sk_mc_loop(struct sock *sk)
 764 {
 765         if (dev_recursion_level())
 766                 return false;
 767         if (!sk)
 768                 return true;
 769         /* IPV6_ADDRFORM can change sk->sk_family under us. */
 770         switch (READ_ONCE(sk->sk_family)) {
 771         case AF_INET:
 772                 return inet_test_bit(MC_LOOP, sk);
 773 #if IS_ENABLED(CONFIG_IPV6)
 774         case AF_INET6:
 775                 return inet6_sk(sk)->mc_loop;
 776 #endif
 777         }
 778         WARN_ON_ONCE(1);
 779         return true;
 780 }
 781 EXPORT_SYMBOL(sk_mc_loop);
 782
 783 void sock_set_reuseaddr(struct sock *sk)
 784 {
 785         lock_sock(sk);
 786         sk->sk_reuse = SK_CAN_REUSE;
 787         release_sock(sk);
 788 }
 789 EXPORT_SYMBOL(sock_set_reuseaddr);
 790
 791 void sock_set_reuseport(struct sock *sk)
 792 {
 793         lock_sock(sk);
 794         sk->sk_reuseport = true;
 795         release_sock(sk);
 796 }
 797 EXPORT_SYMBOL(sock_set_reuseport);
 798
 799 void sock_no_linger(struct sock *sk)
 800 {
 801         lock_sock(sk);
 802         WRITE_ONCE(sk->sk_lingertime, 0);
 803         sock_set_flag(sk, SOCK_LINGER);
 804         release_sock(sk);
 805 }
 806 EXPORT_SYMBOL(sock_no_linger);
 807
 808 void sock_set_priority(struct sock *sk, u32 priority)
 809 {
 810         lock_sock(sk);
 811         WRITE_ONCE(sk->sk_priority, priority);
 812         release_sock(sk);
 813 }
 814 EXPORT_SYMBOL(sock_set_priority);
 815
 816 void sock_set_sndtimeo(struct sock *sk, s64 secs)
 817 {
 818         lock_sock(sk);
 819         if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
 820                 WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
 821         else
 822                 WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
 823         release_sock(sk);
 824 }
 825 EXPORT_SYMBOL(sock_set_sndtimeo);
 826
 827 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
 828 {
 829         if (val)  {
 830                 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
 831                 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
 832                 sock_set_flag(sk, SOCK_RCVTSTAMP);
 833                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 834         } else {
 835                 sock_reset_flag(sk, SOCK_RCVTSTAMP);
 836                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 837         }
 838 }
 839
 840 void sock_enable_timestamps(struct sock *sk)
 841 {
 842         lock_sock(sk);
 843         __sock_set_timestamps(sk, true, false, true);
 844         release_sock(sk);
 845 }
 846 EXPORT_SYMBOL(sock_enable_timestamps);
 847
 848 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
 849 {
 850         switch (optname) {
 851         case SO_TIMESTAMP_OLD:
 852                 __sock_set_timestamps(sk, valbool, false, false);
 853                 break;
 854         case SO_TIMESTAMP_NEW:
 855                 __sock_set_timestamps(sk, valbool, true, false);
 856                 break;
 857         case SO_TIMESTAMPNS_OLD:
 858                 __sock_set_timestamps(sk, valbool, false, true);
 859                 break;
 860         case SO_TIMESTAMPNS_NEW:
 861                 __sock_set_timestamps(sk, valbool, true, true);
 862                 break;
 863         }
 864 }
 865
 866 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
 867 {
 868         struct net *net = sock_net(sk);
 869         struct net_device *dev = NULL;
 870         bool match = false;
 871         int *vclock_index;
 872         int i, num;
 873
 874         if (sk->sk_bound_dev_if)
 875                 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
 876
 877         if (!dev) {
 878                 pr_err("%s: sock not bind to device\n", __func__);
 879                 return -EOPNOTSUPP;
 880         }
 881
 882         num = ethtool_get_phc_vclocks(dev, &vclock_index);
 883         dev_put(dev);
 884
 885         for (i = 0; i < num; i++) {
 886                 if (*(vclock_index + i) == phc_index) {
 887                         match = true;
 888                         break;
 889                 }
 890         }
 891
 892         if (num > 0)
 893                 kfree(vclock_index);
 894
 895         if (!match)
 896                 return -EINVAL;
 897
 898         WRITE_ONCE(sk->sk_bind_phc, phc_index);
 899
 900         return 0;
 901 }
 902
 903 int sock_set_timestamping(struct sock *sk, int optname,
 904                           struct so_timestamping timestamping)
 905 {
 906         int val = timestamping.flags;
 907         int ret;
 908
 909         if (val & ~SOF_TIMESTAMPING_MASK)
 910                 return -EINVAL;
 911
 912         if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
 913             !(val & SOF_TIMESTAMPING_OPT_ID))
 914                 return -EINVAL;
 915
 916         if (val & SOF_TIMESTAMPING_OPT_ID &&
 917             !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 918                 if (sk_is_tcp(sk)) {
 919                         if ((1 << sk->sk_state) &
 920                             (TCPF_CLOSE | TCPF_LISTEN))
 921                                 return -EINVAL;
 922                         if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
 923                                 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
 924                         else
 925                                 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
 926                 } else {
 927                         atomic_set(&sk->sk_tskey, 0);
 928                 }
 929         }
 930
 931         if (val & SOF_TIMESTAMPING_OPT_STATS &&
 932             !(val & SOF_TIMESTAMPING_OPT_TSONLY))
 933                 return -EINVAL;
 934
 935         if (val & SOF_TIMESTAMPING_BIND_PHC) {
 936                 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
 937                 if (ret)
 938                         return ret;
 939         }
 940
 941         WRITE_ONCE(sk->sk_tsflags, val);
 942         sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
 943
 944         if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 945                 sock_enable_timestamp(sk,
 946                                       SOCK_TIMESTAMPING_RX_SOFTWARE);
 947         else
 948                 sock_disable_timestamp(sk,
 949                                        (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 950         return 0;
 951 }
 952
 953 void sock_set_keepalive(struct sock *sk)
 954 {
 955         lock_sock(sk);
 956         if (sk->sk_prot->keepalive)
 957                 sk->sk_prot->keepalive(sk, true);
 958         sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
 959         release_sock(sk);
 960 }
 961 EXPORT_SYMBOL(sock_set_keepalive);
 962
 963 static void __sock_set_rcvbuf(struct sock *sk, int val)
 964 {
 965         /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
 966          * as a negative value.
 967          */
 968         val = min_t(int, val, INT_MAX / 2);
 969         sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 970
 971         /* We double it on the way in to account for "struct sk_buff" etc.
 972          * overhead.   Applications assume that the SO_RCVBUF setting they make
 973          * will allow that much actual data to be received on that socket.
 974          *
 975          * Applications are unaware that "struct sk_buff" and other overheads
 976          * allocate from the receive buffer during socket buffer allocation.
 977          *
 978          * And after considering the possible alternatives, returning the value
 979          * we actually used in getsockopt is the most desirable behavior.
 980          */
 981         WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
 982 }
 983
 984 void sock_set_rcvbuf(struct sock *sk, int val)
 985 {
 986         lock_sock(sk);
 987         __sock_set_rcvbuf(sk, val);
 988         release_sock(sk);
 989 }
 990 EXPORT_SYMBOL(sock_set_rcvbuf);
 991
 992 static void __sock_set_mark(struct sock *sk, u32 val)
 993 {
 994         if (val != sk->sk_mark) {
 995                 WRITE_ONCE(sk->sk_mark, val);
 996                 sk_dst_reset(sk);
 997         }
 998 }
 999
1000 void sock_set_mark(struct sock *sk, u32 val)
1001 {
1002         lock_sock(sk);
1003         __sock_set_mark(sk, val);
1004         release_sock(sk);
1005 }
1006 EXPORT_SYMBOL(sock_set_mark);
1007
1008 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1009 {
1010         /* Round down bytes to multiple of pages */
1011         bytes = round_down(bytes, PAGE_SIZE);
1012
1013         WARN_ON(bytes > sk->sk_reserved_mem);
1014         WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1015         sk_mem_reclaim(sk);
1016 }
1017
1018 static int sock_reserve_memory(struct sock *sk, int bytes)
1019 {
1020         long allocated;
1021         bool charged;
1022         int pages;
1023
1024         if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1025                 return -EOPNOTSUPP;
1026
1027         if (!bytes)
1028                 return 0;
1029
1030         pages = sk_mem_pages(bytes);
1031
1032         /* pre-charge to memcg */
1033         charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1034                                           GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1035         if (!charged)
1036                 return -ENOMEM;
1037
1038         /* pre-charge to forward_alloc */
1039         sk_memory_allocated_add(sk, pages);
1040         allocated = sk_memory_allocated(sk);
1041         /* If the system goes into memory pressure with this
1042          * precharge, give up and return error.
1043          */
1044         if (allocated > sk_prot_mem_limits(sk, 1)) {
1045                 sk_memory_allocated_sub(sk, pages);
1046                 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1047                 return -ENOMEM;
1048         }
1049         sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1050
1051         WRITE_ONCE(sk->sk_reserved_mem,
1052                    sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1053
1054         return 0;
1055 }
1056
/**
 * sockopt_lock_sock - take the socket lock for a sockopt operation
 * @sk: socket to lock
 *
 * Does nothing when has_current_bpf_ctx() is true; otherwise calls
 * lock_sock().  Pair with sockopt_release_sock().
 */
void sockopt_lock_sock(struct sock *sk)
{
        /* A set current->bpf_ctx means setsockopt was invoked from a bpf
         * prog, and bpf has already made sure the sk lock is held before
         * calling setsockopt().
         */
        if (!has_current_bpf_ctx())
                lock_sock(sk);
}
EXPORT_SYMBOL(sockopt_lock_sock);
1069
/**
 * sockopt_release_sock - counterpart of sockopt_lock_sock()
 * @sk: socket to unlock
 *
 * Calls release_sock() unless has_current_bpf_ctx() is true, mirroring the
 * condition under which sockopt_lock_sock() skipped taking the lock.
 */
void sockopt_release_sock(struct sock *sk)
{
        if (!has_current_bpf_ctx())
                release_sock(sk);
}
EXPORT_SYMBOL(sockopt_release_sock);
1078
1079 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1080 {
1081         return has_current_bpf_ctx() || ns_capable(ns, cap);
1082 }
1083 EXPORT_SYMBOL(sockopt_ns_capable);
1084
1085 bool sockopt_capable(int cap)
1086 {
1087         return has_current_bpf_ctx() || capable(cap);
1088 }
1089 EXPORT_SYMBOL(sockopt_capable);
1090
1091 /*
1092  *      This is meant for all protocols to use and covers goings on
1093  *      at the socket level. Everything here is generic.
1094  */
1095
1096 int sk_setsockopt(struct sock *sk, int level, int optname,
1097                   sockptr_t optval, unsigned int optlen)
1098 {
1099         struct so_timestamping timestamping;
1100         struct socket *sock = sk->sk_socket;
1101         struct sock_txtime sk_txtime;
1102         int val;
1103         int valbool;
1104         struct linger ling;
1105         int ret = 0;
1106
1107         /*
1108          *      Options without arguments
1109          */
1110
1111         if (optname == SO_BINDTODEVICE)
1112                 return sock_setbindtodevice(sk, optval, optlen);
1113
1114         if (optlen < sizeof(int))
1115                 return -EINVAL;
1116
1117         if (copy_from_sockptr(&val, optval, sizeof(val)))
1118                 return -EFAULT;
1119
1120         valbool = val ? 1 : 0;
1121
1122         sockopt_lock_sock(sk);
1123
1124         switch (optname) {
1125         case SO_DEBUG:
1126                 if (val && !sockopt_capable(CAP_NET_ADMIN))
1127                         ret = -EACCES;
1128                 else
1129                         sock_valbool_flag(sk, SOCK_DBG, valbool);
1130                 break;
1131         case SO_REUSEADDR:
1132                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1133                 break;
1134         case SO_REUSEPORT:
1135                 sk->sk_reuseport = valbool;
1136                 break;
1137         case SO_TYPE:
1138         case SO_PROTOCOL:
1139         case SO_DOMAIN:
1140         case SO_ERROR:
1141                 ret = -ENOPROTOOPT;
1142                 break;
1143         case SO_DONTROUTE:
1144                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1145                 sk_dst_reset(sk);
1146                 break;
1147         case SO_BROADCAST:
1148                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1149                 break;
1150         case SO_SNDBUF:
1151                 /* Don't error on this BSD doesn't and if you think
1152                  * about it this is right. Otherwise apps have to
1153                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
1154                  * are treated in BSD as hints
1155                  */
1156                 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1157 set_sndbuf:
1158                 /* Ensure val * 2 fits into an int, to prevent max_t()
1159                  * from treating it as a negative value.
1160                  */
1161                 val = min_t(int, val, INT_MAX / 2);
1162                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1163                 WRITE_ONCE(sk->sk_sndbuf,
1164                            max_t(int, val * 2, SOCK_MIN_SNDBUF));
1165                 /* Wake up sending tasks if we upped the value. */
1166                 sk->sk_write_space(sk);
1167                 break;
1168
1169         case SO_SNDBUFFORCE:
1170                 if (!sockopt_capable(CAP_NET_ADMIN)) {
1171                         ret = -EPERM;
1172                         break;
1173                 }
1174
1175                 /* No negative values (to prevent underflow, as val will be
1176                  * multiplied by 2).
1177                  */
1178                 if (val < 0)
1179                         val = 0;
1180                 goto set_sndbuf;
1181
1182         case SO_RCVBUF:
1183                 /* Don't error on this BSD doesn't and if you think
1184                  * about it this is right. Otherwise apps have to
1185                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
1186                  * are treated in BSD as hints
1187                  */
1188                 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1189                 break;
1190
1191         case SO_RCVBUFFORCE:
1192                 if (!sockopt_capable(CAP_NET_ADMIN)) {
1193                         ret = -EPERM;
1194                         break;
1195                 }
1196
1197                 /* No negative values (to prevent underflow, as val will be
1198                  * multiplied by 2).
1199                  */
1200                 __sock_set_rcvbuf(sk, max(val, 0));
1201                 break;
1202
1203         case SO_KEEPALIVE:
1204                 if (sk->sk_prot->keepalive)
1205                         sk->sk_prot->keepalive(sk, valbool);
1206                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1207                 break;
1208
1209         case SO_OOBINLINE:
1210                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1211                 break;
1212
1213         case SO_NO_CHECK:
1214                 sk->sk_no_check_tx = valbool;
1215                 break;
1216
1217         case SO_PRIORITY:
1218                 if ((val >= 0 && val <= 6) ||
1219                     sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1220                     sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1221                         WRITE_ONCE(sk->sk_priority, val);
1222                 else
1223                         ret = -EPERM;
1224                 break;
1225
1226         case SO_LINGER:
1227                 if (optlen < sizeof(ling)) {
1228                         ret = -EINVAL;  /* 1003.1g */
1229                         break;
1230                 }
1231                 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1232                         ret = -EFAULT;
1233                         break;
1234                 }
1235                 if (!ling.l_onoff) {
1236                         sock_reset_flag(sk, SOCK_LINGER);
1237                 } else {
1238                         unsigned long t_sec = ling.l_linger;
1239
1240                         if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1241                                 WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1242                         else
1243                                 WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1244                         sock_set_flag(sk, SOCK_LINGER);
1245                 }
1246                 break;
1247
1248         case SO_BSDCOMPAT:
1249                 break;
1250
1251         case SO_PASSCRED:
1252                 assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1253                 break;
1254
1255         case SO_PASSPIDFD:
1256                 assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1257                 break;
1258
1259         case SO_TIMESTAMP_OLD:
1260         case SO_TIMESTAMP_NEW:
1261         case SO_TIMESTAMPNS_OLD:
1262         case SO_TIMESTAMPNS_NEW:
1263                 sock_set_timestamp(sk, optname, valbool);
1264                 break;
1265
1266         case SO_TIMESTAMPING_NEW:
1267         case SO_TIMESTAMPING_OLD:
1268                 if (optlen == sizeof(timestamping)) {
1269                         if (copy_from_sockptr(&timestamping, optval,
1270                                               sizeof(timestamping))) {
1271                                 ret = -EFAULT;
1272                                 break;
1273                         }
1274                 } else {
1275                         memset(&timestamping, 0, sizeof(timestamping));
1276                         timestamping.flags = val;
1277                 }
1278                 ret = sock_set_timestamping(sk, optname, timestamping);
1279                 break;
1280
1281         case SO_RCVLOWAT:
1282                 {
1283                 int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1284
1285                 if (val < 0)
1286                         val = INT_MAX;
1287                 if (sock)
1288                         set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1289                 if (set_rcvlowat)
1290                         ret = set_rcvlowat(sk, val);
1291                 else
1292                         WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1293                 break;
1294                 }
1295         case SO_RCVTIMEO_OLD:
1296         case SO_RCVTIMEO_NEW:
1297                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1298                                        optlen, optname == SO_RCVTIMEO_OLD);
1299                 break;
1300
1301         case SO_SNDTIMEO_OLD:
1302         case SO_SNDTIMEO_NEW:
1303                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1304                                        optlen, optname == SO_SNDTIMEO_OLD);
1305                 break;
1306
1307         case SO_ATTACH_FILTER: {
1308                 struct sock_fprog fprog;
1309
1310                 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1311                 if (!ret)
1312                         ret = sk_attach_filter(&fprog, sk);
1313                 break;
1314         }
1315         case SO_ATTACH_BPF:
1316                 ret = -EINVAL;
1317                 if (optlen == sizeof(u32)) {
1318                         u32 ufd;
1319
1320                         ret = -EFAULT;
1321                         if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1322                                 break;
1323
1324                         ret = sk_attach_bpf(ufd, sk);
1325                 }
1326                 break;
1327
1328         case SO_ATTACH_REUSEPORT_CBPF: {
1329                 struct sock_fprog fprog;
1330
1331                 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1332                 if (!ret)
1333                         ret = sk_reuseport_attach_filter(&fprog, sk);
1334                 break;
1335         }
1336         case SO_ATTACH_REUSEPORT_EBPF:
1337                 ret = -EINVAL;
1338                 if (optlen == sizeof(u32)) {
1339                         u32 ufd;
1340
1341                         ret = -EFAULT;
1342                         if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1343                                 break;
1344
1345                         ret = sk_reuseport_attach_bpf(ufd, sk);
1346                 }
1347                 break;
1348
1349         case SO_DETACH_REUSEPORT_BPF:
1350                 ret = reuseport_detach_prog(sk);
1351                 break;
1352
1353         case SO_DETACH_FILTER:
1354                 ret = sk_detach_filter(sk);
1355                 break;
1356
1357         case SO_LOCK_FILTER:
1358                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1359                         ret = -EPERM;
1360                 else
1361                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1362                 break;
1363
1364         case SO_PASSSEC:
1365                 assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1366                 break;
1367         case SO_MARK:
1368                 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1369                     !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1370                         ret = -EPERM;
1371                         break;
1372                 }
1373
1374                 __sock_set_mark(sk, val);
1375                 break;
1376         case SO_RCVMARK:
1377                 sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1378                 break;
1379
1380         case SO_RXQ_OVFL:
1381                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1382                 break;
1383
1384         case SO_WIFI_STATUS:
1385                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1386                 break;
1387
1388         case SO_PEEK_OFF:
1389                 {
1390                 int (*set_peek_off)(struct sock *sk, int val);
1391
1392                 set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1393                 if (set_peek_off)
1394                         ret = set_peek_off(sk, val);
1395                 else
1396                         ret = -EOPNOTSUPP;
1397                 break;
1398                 }
1399
1400         case SO_NOFCS:
1401                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1402                 break;
1403
1404         case SO_SELECT_ERR_QUEUE:
1405                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1406                 break;
1407
1408 #ifdef CONFIG_NET_RX_BUSY_POLL
1409         case SO_BUSY_POLL:
1410                 if (val < 0)
1411                         ret = -EINVAL;
1412                 else
1413                         WRITE_ONCE(sk->sk_ll_usec, val);
1414                 break;
1415         case SO_PREFER_BUSY_POLL:
1416                 if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1417                         ret = -EPERM;
1418                 else
1419                         WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1420                 break;
1421         case SO_BUSY_POLL_BUDGET:
1422                 if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) {
1423                         ret = -EPERM;
1424                 } else {
1425                         if (val < 0 || val > U16_MAX)
1426                                 ret = -EINVAL;
1427                         else
1428                                 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1429                 }
1430                 break;
1431 #endif
1432
1433         case SO_MAX_PACING_RATE:
1434                 {
1435                 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1436
1437                 if (sizeof(ulval) != sizeof(val) &&
1438                     optlen >= sizeof(ulval) &&
1439                     copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1440                         ret = -EFAULT;
1441                         break;
1442                 }
1443                 if (ulval != ~0UL)
1444                         cmpxchg(&sk->sk_pacing_status,
1445                                 SK_PACING_NONE,
1446                                 SK_PACING_NEEDED);
1447                 /* Pairs with READ_ONCE() from sk_getsockopt() */
1448                 WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1449                 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1450                 break;
1451                 }
1452         case SO_INCOMING_CPU:
1453                 reuseport_update_incoming_cpu(sk, val);
1454                 break;
1455
1456         case SO_CNX_ADVICE:
1457                 if (val == 1)
1458                         dst_negative_advice(sk);
1459                 break;
1460
1461         case SO_ZEROCOPY:
1462                 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1463                         if (!(sk_is_tcp(sk) ||
1464                               (sk->sk_type == SOCK_DGRAM &&
1465                                sk->sk_protocol == IPPROTO_UDP)))
1466                                 ret = -EOPNOTSUPP;
1467                 } else if (sk->sk_family != PF_RDS) {
1468                         ret = -EOPNOTSUPP;
1469                 }
1470                 if (!ret) {
1471                         if (val < 0 || val > 1)
1472                                 ret = -EINVAL;
1473                         else
1474                                 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1475                 }
1476                 break;
1477
1478         case SO_TXTIME:
1479                 if (optlen != sizeof(struct sock_txtime)) {
1480                         ret = -EINVAL;
1481                         break;
1482                 } else if (copy_from_sockptr(&sk_txtime, optval,
1483                            sizeof(struct sock_txtime))) {
1484                         ret = -EFAULT;
1485                         break;
1486                 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1487                         ret = -EINVAL;
1488                         break;
1489                 }
1490                 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1491                  * scheduler has enough safe guards.
1492                  */
1493                 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1494                     !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1495                         ret = -EPERM;
1496                         break;
1497                 }
1498                 sock_valbool_flag(sk, SOCK_TXTIME, true);
1499                 sk->sk_clockid = sk_txtime.clockid;
1500                 sk->sk_txtime_deadline_mode =
1501                         !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1502                 sk->sk_txtime_report_errors =
1503                         !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1504                 break;
1505
1506         case SO_BINDTOIFINDEX:
1507                 ret = sock_bindtoindex_locked(sk, val);
1508                 break;
1509
1510         case SO_BUF_LOCK:
1511                 if (val & ~SOCK_BUF_LOCK_MASK) {
1512                         ret = -EINVAL;
1513                         break;
1514                 }
1515                 sk->sk_userlocks = val | (sk->sk_userlocks &
1516                                           ~SOCK_BUF_LOCK_MASK);
1517                 break;
1518
1519         case SO_RESERVE_MEM:
1520         {
1521                 int delta;
1522
1523                 if (val < 0) {
1524                         ret = -EINVAL;
1525                         break;
1526                 }
1527
1528                 delta = val - sk->sk_reserved_mem;
1529                 if (delta < 0)
1530                         sock_release_reserved_memory(sk, -delta);
1531                 else
1532                         ret = sock_reserve_memory(sk, delta);
1533                 break;
1534         }
1535
1536         case SO_TXREHASH:
1537                 if (val < -1 || val > 1) {
1538                         ret = -EINVAL;
1539                         break;
1540                 }
1541                 if ((u8)val == SOCK_TXREHASH_DEFAULT)
1542                         val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1543                 /* Paired with READ_ONCE() in tcp_rtx_synack()
1544                  * and sk_getsockopt().
1545                  */
1546                 WRITE_ONCE(sk->sk_txrehash, (u8)val);
1547                 break;
1548
1549         default:
1550                 ret = -ENOPROTOOPT;
1551                 break;
1552         }
1553         sockopt_release_sock(sk);
1554         return ret;
1555 }
1556
1557 int sock_setsockopt(struct socket *sock, int level, int optname,
1558                     sockptr_t optval, unsigned int optlen)
1559 {
1560         return sk_setsockopt(sock->sk, level, optname,
1561                              optval, optlen);
1562 }
1563 EXPORT_SYMBOL(sock_setsockopt);
1564
1565 static const struct cred *sk_get_peer_cred(struct sock *sk)
1566 {
1567         const struct cred *cred;
1568
1569         spin_lock(&sk->sk_peer_lock);
1570         cred = get_cred(sk->sk_peer_cred);
1571         spin_unlock(&sk->sk_peer_lock);
1572
1573         return cred;
1574 }
1575
1576 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1577                           struct ucred *ucred)
1578 {
1579         ucred->pid = pid_vnr(pid);
1580         ucred->uid = ucred->gid = -1;
1581         if (cred) {
1582                 struct user_namespace *current_ns = current_user_ns();
1583
1584                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1585                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1586         }
1587 }
1588
1589 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1590 {
1591         struct user_namespace *user_ns = current_user_ns();
1592         int i;
1593
1594         for (i = 0; i < src->ngroups; i++) {
1595                 gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1596
1597                 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1598                         return -EFAULT;
1599         }
1600
1601         return 0;
1602 }
1603
1604 int sk_getsockopt(struct sock *sk, int level, int optname,
1605                   sockptr_t optval, sockptr_t optlen)
1606 {
1607         struct socket *sock = sk->sk_socket;
1608
1609         union {
1610                 int val;
1611                 u64 val64;
1612                 unsigned long ulval;
1613                 struct linger ling;
1614                 struct old_timeval32 tm32;
1615                 struct __kernel_old_timeval tm;
1616                 struct  __kernel_sock_timeval stm;
1617                 struct sock_txtime txtime;
1618                 struct so_timestamping timestamping;
1619         } v;
1620
1621         int lv = sizeof(int);
1622         int len;
1623
1624         if (copy_from_sockptr(&len, optlen, sizeof(int)))
1625                 return -EFAULT;
1626         if (len < 0)
1627                 return -EINVAL;
1628
1629         memset(&v, 0, sizeof(v));
1630
1631         switch (optname) {
1632         case SO_DEBUG:
1633                 v.val = sock_flag(sk, SOCK_DBG);
1634                 break;
1635
1636         case SO_DONTROUTE:
1637                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1638                 break;
1639
1640         case SO_BROADCAST:
1641                 v.val = sock_flag(sk, SOCK_BROADCAST);
1642                 break;
1643
1644         case SO_SNDBUF:
1645                 v.val = READ_ONCE(sk->sk_sndbuf);
1646                 break;
1647
1648         case SO_RCVBUF:
1649                 v.val = READ_ONCE(sk->sk_rcvbuf);
1650                 break;
1651
1652         case SO_REUSEADDR:
1653                 v.val = sk->sk_reuse;
1654                 break;
1655
1656         case SO_REUSEPORT:
1657                 v.val = sk->sk_reuseport;
1658                 break;
1659
1660         case SO_KEEPALIVE:
1661                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1662                 break;
1663
1664         case SO_TYPE:
1665                 v.val = sk->sk_type;
1666                 break;
1667
1668         case SO_PROTOCOL:
1669                 v.val = sk->sk_protocol;
1670                 break;
1671
1672         case SO_DOMAIN:
1673                 v.val = sk->sk_family;
1674                 break;
1675
1676         case SO_ERROR:
1677                 v.val = -sock_error(sk);
1678                 if (v.val == 0)
1679                         v.val = xchg(&sk->sk_err_soft, 0);
1680                 break;
1681
1682         case SO_OOBINLINE:
1683                 v.val = sock_flag(sk, SOCK_URGINLINE);
1684                 break;
1685
1686         case SO_NO_CHECK:
1687                 v.val = sk->sk_no_check_tx;
1688                 break;
1689
1690         case SO_PRIORITY:
1691                 v.val = READ_ONCE(sk->sk_priority);
1692                 break;
1693
1694         case SO_LINGER:
1695                 lv              = sizeof(v.ling);
1696                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1697                 v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ;
1698                 break;
1699
1700         case SO_BSDCOMPAT:
1701                 break;
1702
1703         case SO_TIMESTAMP_OLD:
1704                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1705                                 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1706                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1707                 break;
1708
1709         case SO_TIMESTAMPNS_OLD:
1710                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1711                 break;
1712
1713         case SO_TIMESTAMP_NEW:
1714                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1715                 break;
1716
1717         case SO_TIMESTAMPNS_NEW:
1718                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1719                 break;
1720
1721         case SO_TIMESTAMPING_OLD:
1722         case SO_TIMESTAMPING_NEW:
1723                 lv = sizeof(v.timestamping);
1724                 /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1725                  * returning the flags when they were set through the same option.
1726                  * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1727                  */
1728                 if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1729                         v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1730                         v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1731                 }
1732                 break;
1733
1734         case SO_RCVTIMEO_OLD:
1735         case SO_RCVTIMEO_NEW:
1736                 lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1737                                       SO_RCVTIMEO_OLD == optname);
1738                 break;
1739
1740         case SO_SNDTIMEO_OLD:
1741         case SO_SNDTIMEO_NEW:
1742                 lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1743                                       SO_SNDTIMEO_OLD == optname);
1744                 break;
1745
1746         case SO_RCVLOWAT:
1747                 v.val = READ_ONCE(sk->sk_rcvlowat);
1748                 break;
1749
1750         case SO_SNDLOWAT:
1751                 v.val = 1;
1752                 break;
1753
1754         case SO_PASSCRED:
1755                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1756                 break;
1757
1758         case SO_PASSPIDFD:
1759                 v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1760                 break;
1761
1762         case SO_PEERCRED:
1763         {
1764                 struct ucred peercred;
1765                 if (len > sizeof(peercred))
1766                         len = sizeof(peercred);
1767
1768                 spin_lock(&sk->sk_peer_lock);
1769                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1770                 spin_unlock(&sk->sk_peer_lock);
1771
1772                 if (copy_to_sockptr(optval, &peercred, len))
1773                         return -EFAULT;
1774                 goto lenout;
1775         }
1776
1777         case SO_PEERPIDFD:
1778         {
1779                 struct pid *peer_pid;
1780                 struct file *pidfd_file = NULL;
1781                 int pidfd;
1782
1783                 if (len > sizeof(pidfd))
1784                         len = sizeof(pidfd);
1785
1786                 spin_lock(&sk->sk_peer_lock);
1787                 peer_pid = get_pid(sk->sk_peer_pid);
1788                 spin_unlock(&sk->sk_peer_lock);
1789
1790                 if (!peer_pid)
1791                         return -ENODATA;
1792
1793                 pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1794                 put_pid(peer_pid);
1795                 if (pidfd < 0)
1796                         return pidfd;
1797
1798                 if (copy_to_sockptr(optval, &pidfd, len) ||
1799                     copy_to_sockptr(optlen, &len, sizeof(int))) {
1800                         put_unused_fd(pidfd);
1801                         fput(pidfd_file);
1802
1803                         return -EFAULT;
1804                 }
1805
1806                 fd_install(pidfd, pidfd_file);
1807                 return 0;
1808         }
1809
1810         case SO_PEERGROUPS:
1811         {
1812                 const struct cred *cred;
1813                 int ret, n;
1814
1815                 cred = sk_get_peer_cred(sk);
1816                 if (!cred)
1817                         return -ENODATA;
1818
1819                 n = cred->group_info->ngroups;
1820                 if (len < n * sizeof(gid_t)) {
1821                         len = n * sizeof(gid_t);
1822                         put_cred(cred);
1823                         return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1824                 }
1825                 len = n * sizeof(gid_t);
1826
1827                 ret = groups_to_user(optval, cred->group_info);
1828                 put_cred(cred);
1829                 if (ret)
1830                         return ret;
1831                 goto lenout;
1832         }
1833
1834         case SO_PEERNAME:
1835         {
1836                 struct sockaddr_storage address;
1837
1838                 lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1839                 if (lv < 0)
1840                         return -ENOTCONN;
1841                 if (lv < len)
1842                         return -EINVAL;
1843                 if (copy_to_sockptr(optval, &address, len))
1844                         return -EFAULT;
1845                 goto lenout;
1846         }
1847
1848         /* Dubious BSD thing... Probably nobody even uses it, but
1849          * the UNIX standard wants it for whatever reason... -DaveM
1850          */
1851         case SO_ACCEPTCONN:
1852                 v.val = sk->sk_state == TCP_LISTEN;
1853                 break;
1854
1855         case SO_PASSSEC:
1856                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1857                 break;
1858
1859         case SO_PEERSEC:
1860                 return security_socket_getpeersec_stream(sock,
1861                                                          optval, optlen, len);
1862
1863         case SO_MARK:
1864                 v.val = READ_ONCE(sk->sk_mark);
1865                 break;
1866
1867         case SO_RCVMARK:
1868                 v.val = sock_flag(sk, SOCK_RCVMARK);
1869                 break;
1870
1871         case SO_RXQ_OVFL:
1872                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1873                 break;
1874
1875         case SO_WIFI_STATUS:
1876                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1877                 break;
1878
1879         case SO_PEEK_OFF:
1880                 if (!READ_ONCE(sock->ops)->set_peek_off)
1881                         return -EOPNOTSUPP;
1882
1883                 v.val = READ_ONCE(sk->sk_peek_off);
1884                 break;
1885         case SO_NOFCS:
1886                 v.val = sock_flag(sk, SOCK_NOFCS);
1887                 break;
1888
1889         case SO_BINDTODEVICE:
1890                 return sock_getbindtodevice(sk, optval, optlen, len);
1891
1892         case SO_GET_FILTER:
1893                 len = sk_get_filter(sk, optval, len);
1894                 if (len < 0)
1895                         return len;
1896
1897                 goto lenout;
1898
1899         case SO_LOCK_FILTER:
1900                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1901                 break;
1902
1903         case SO_BPF_EXTENSIONS:
1904                 v.val = bpf_tell_extensions();
1905                 break;
1906
1907         case SO_SELECT_ERR_QUEUE:
1908                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1909                 break;
1910
1911 #ifdef CONFIG_NET_RX_BUSY_POLL
1912         case SO_BUSY_POLL:
1913                 v.val = READ_ONCE(sk->sk_ll_usec);
1914                 break;
1915         case SO_PREFER_BUSY_POLL:
1916                 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1917                 break;
1918 #endif
1919
1920         case SO_MAX_PACING_RATE:
1921                 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
1922                 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1923                         lv = sizeof(v.ulval);
1924                         v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1925                 } else {
1926                         /* 32bit version */
1927                         v.val = min_t(unsigned long, ~0U,
1928                                       READ_ONCE(sk->sk_max_pacing_rate));
1929                 }
1930                 break;
1931
1932         case SO_INCOMING_CPU:
1933                 v.val = READ_ONCE(sk->sk_incoming_cpu);
1934                 break;
1935
1936         case SO_MEMINFO:
1937         {
1938                 u32 meminfo[SK_MEMINFO_VARS];
1939
1940                 sk_get_meminfo(sk, meminfo);
1941
1942                 len = min_t(unsigned int, len, sizeof(meminfo));
1943                 if (copy_to_sockptr(optval, &meminfo, len))
1944                         return -EFAULT;
1945
1946                 goto lenout;
1947         }
1948
1949 #ifdef CONFIG_NET_RX_BUSY_POLL
1950         case SO_INCOMING_NAPI_ID:
1951                 v.val = READ_ONCE(sk->sk_napi_id);
1952
1953                 /* aggregate non-NAPI IDs down to 0 */
1954                 if (v.val < MIN_NAPI_ID)
1955                         v.val = 0;
1956
1957                 break;
1958 #endif
1959
1960         case SO_COOKIE:
1961                 lv = sizeof(u64);
1962                 if (len < lv)
1963                         return -EINVAL;
1964                 v.val64 = sock_gen_cookie(sk);
1965                 break;
1966
1967         case SO_ZEROCOPY:
1968                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1969                 break;
1970
1971         case SO_TXTIME:
1972                 lv = sizeof(v.txtime);
1973                 v.txtime.clockid = sk->sk_clockid;
1974                 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1975                                   SOF_TXTIME_DEADLINE_MODE : 0;
1976                 v.txtime.flags |= sk->sk_txtime_report_errors ?
1977                                   SOF_TXTIME_REPORT_ERRORS : 0;
1978                 break;
1979
1980         case SO_BINDTOIFINDEX:
1981                 v.val = READ_ONCE(sk->sk_bound_dev_if);
1982                 break;
1983
1984         case SO_NETNS_COOKIE:
1985                 lv = sizeof(u64);
1986                 if (len != lv)
1987                         return -EINVAL;
1988                 v.val64 = sock_net(sk)->net_cookie;
1989                 break;
1990
1991         case SO_BUF_LOCK:
1992                 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1993                 break;
1994
1995         case SO_RESERVE_MEM:
1996                 v.val = READ_ONCE(sk->sk_reserved_mem);
1997                 break;
1998
1999         case SO_TXREHASH:
2000                 /* Paired with WRITE_ONCE() in sk_setsockopt() */
2001                 v.val = READ_ONCE(sk->sk_txrehash);
2002                 break;
2003
2004         default:
2005                 /* We implement the SO_SNDLOWAT etc to not be settable
2006                  * (1003.1g 7).
2007                  */
2008                 return -ENOPROTOOPT;
2009         }
2010
2011         if (len > lv)
2012                 len = lv;
2013         if (copy_to_sockptr(optval, &v, len))
2014                 return -EFAULT;
2015 lenout:
2016         if (copy_to_sockptr(optlen, &len, sizeof(int)))
2017                 return -EFAULT;
2018         return 0;
2019 }
2020
2021 int sock_getsockopt(struct socket *sock, int level, int optname,
2022                     char __user *optval, int __user *optlen)
2023 {
2024         return sk_getsockopt(sock->sk, level, optname,
2025                              USER_SOCKPTR(optval),
2026                              USER_SOCKPTR(optlen));
2027 }
2028
2029 /*
2030  * Initialize an sk_lock.
2031  *
2032  * (We also register the sk_lock with the lock validator.)
2033  */
2034 static inline void sock_lock_init(struct sock *sk)
2035 {
2036         if (sk->sk_kern_sock)
2037                 sock_lock_init_class_and_name(
2038                         sk,
2039                         af_family_kern_slock_key_strings[sk->sk_family],
2040                         af_family_kern_slock_keys + sk->sk_family,
2041                         af_family_kern_key_strings[sk->sk_family],
2042                         af_family_kern_keys + sk->sk_family);
2043         else
2044                 sock_lock_init_class_and_name(
2045                         sk,
2046                         af_family_slock_key_strings[sk->sk_family],
2047                         af_family_slock_keys + sk->sk_family,
2048                         af_family_key_strings[sk->sk_family],
2049                         af_family_keys + sk->sk_family);
2050 }
2051
2052 /*
2053  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2054  * even temporarly, because of RCU lookups. sk_node should also be left as is.
2055  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2056  */
2057 static void sock_copy(struct sock *nsk, const struct sock *osk)
2058 {
2059         const struct proto *prot = READ_ONCE(osk->sk_prot);
2060 #ifdef CONFIG_SECURITY_NETWORK
2061         void *sptr = nsk->sk_security;
2062 #endif
2063
2064         /* If we move sk_tx_queue_mapping out of the private section,
2065          * we must check if sk_tx_queue_clear() is called after
2066          * sock_copy() in sk_clone_lock().
2067          */
2068         BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2069                      offsetof(struct sock, sk_dontcopy_begin) ||
2070                      offsetof(struct sock, sk_tx_queue_mapping) >=
2071                      offsetof(struct sock, sk_dontcopy_end));
2072
2073         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2074
2075         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2076                prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
2077
2078 #ifdef CONFIG_SECURITY_NETWORK
2079         nsk->sk_security = sptr;
2080         security_sk_clone(osk, nsk);
2081 #endif
2082 }
2083
2084 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2085                 int family)
2086 {
2087         struct sock *sk;
2088         struct kmem_cache *slab;
2089
2090         slab = prot->slab;
2091         if (slab != NULL) {
2092                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2093                 if (!sk)
2094                         return sk;
2095                 if (want_init_on_alloc(priority))
2096                         sk_prot_clear_nulls(sk, prot->obj_size);
2097         } else
2098                 sk = kmalloc(prot->obj_size, priority);
2099
2100         if (sk != NULL) {
2101                 if (security_sk_alloc(sk, family, priority))
2102                         goto out_free;
2103
2104                 if (!try_module_get(prot->owner))
2105                         goto out_free_sec;
2106         }
2107
2108         return sk;
2109
2110 out_free_sec:
2111         security_sk_free(sk);
2112 out_free:
2113         if (slab != NULL)
2114                 kmem_cache_free(slab, sk);
2115         else
2116                 kfree(sk);
2117         return NULL;
2118 }
2119
2120 static void sk_prot_free(struct proto *prot, struct sock *sk)
2121 {
2122         struct kmem_cache *slab;
2123         struct module *owner;
2124
2125         owner = prot->owner;
2126         slab = prot->slab;
2127
2128         cgroup_sk_free(&sk->sk_cgrp_data);
2129         mem_cgroup_sk_free(sk);
2130         security_sk_free(sk);
2131         if (slab != NULL)
2132                 kmem_cache_free(slab, sk);
2133         else
2134                 kfree(sk);
2135         module_put(owner);
2136 }
2137
2138 /**
2139  *      sk_alloc - All socket objects are allocated here
2140  *      @net: the applicable net namespace
2141  *      @family: protocol family
2142  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2143  *      @prot: struct proto associated with this new sock instance
2144  *      @kern: is this to be a kernel socket?
2145  */
2146 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2147                       struct proto *prot, int kern)
2148 {
2149         struct sock *sk;
2150
2151         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2152         if (sk) {
2153                 sk->sk_family = family;
2154                 /*
2155                  * See comment in struct sock definition to understand
2156                  * why we need sk_prot_creator -acme
2157                  */
2158                 sk->sk_prot = sk->sk_prot_creator = prot;
2159                 sk->sk_kern_sock = kern;
2160                 sock_lock_init(sk);
2161                 sk->sk_net_refcnt = kern ? 0 : 1;
2162                 if (likely(sk->sk_net_refcnt)) {
2163                         get_net_track(net, &sk->ns_tracker, priority);
2164                         sock_inuse_add(net, 1);
2165                 } else {
2166                         __netns_tracker_alloc(net, &sk->ns_tracker,
2167                                               false, priority);
2168                 }
2169
2170                 sock_net_set(sk, net);
2171                 refcount_set(&sk->sk_wmem_alloc, 1);
2172
2173                 mem_cgroup_sk_alloc(sk);
2174                 cgroup_sk_alloc(&sk->sk_cgrp_data);
2175                 sock_update_classid(&sk->sk_cgrp_data);
2176                 sock_update_netprioidx(&sk->sk_cgrp_data);
2177                 sk_tx_queue_clear(sk);
2178         }
2179
2180         return sk;
2181 }
2182 EXPORT_SYMBOL(sk_alloc);
2183
2184 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2185  * grace period. This is the case for UDP sockets and TCP listeners.
2186  */
2187 static void __sk_destruct(struct rcu_head *head)
2188 {
2189         struct sock *sk = container_of(head, struct sock, sk_rcu);
2190         struct sk_filter *filter;
2191
2192         if (sk->sk_destruct)
2193                 sk->sk_destruct(sk);
2194
2195         filter = rcu_dereference_check(sk->sk_filter,
2196                                        refcount_read(&sk->sk_wmem_alloc) == 0);
2197         if (filter) {
2198                 sk_filter_uncharge(sk, filter);
2199                 RCU_INIT_POINTER(sk->sk_filter, NULL);
2200         }
2201
2202         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2203
2204 #ifdef CONFIG_BPF_SYSCALL
2205         bpf_sk_storage_free(sk);
2206 #endif
2207
2208         if (atomic_read(&sk->sk_omem_alloc))
2209                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2210                          __func__, atomic_read(&sk->sk_omem_alloc));
2211
2212         if (sk->sk_frag.page) {
2213                 put_page(sk->sk_frag.page);
2214                 sk->sk_frag.page = NULL;
2215         }
2216
2217         /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2218         put_cred(sk->sk_peer_cred);
2219         put_pid(sk->sk_peer_pid);
2220
2221         if (likely(sk->sk_net_refcnt))
2222                 put_net_track(sock_net(sk), &sk->ns_tracker);
2223         else
2224                 __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2225
2226         sk_prot_free(sk->sk_prot_creator, sk);
2227 }
2228
2229 void sk_destruct(struct sock *sk)
2230 {
2231         bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2232
2233         if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2234                 reuseport_detach_sock(sk);
2235                 use_call_rcu = true;
2236         }
2237
2238         if (use_call_rcu)
2239                 call_rcu(&sk->sk_rcu, __sk_destruct);
2240         else
2241                 __sk_destruct(&sk->sk_rcu);
2242 }
2243
2244 static void __sk_free(struct sock *sk)
2245 {
2246         if (likely(sk->sk_net_refcnt))
2247                 sock_inuse_add(sock_net(sk), -1);
2248
2249         if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2250                 sock_diag_broadcast_destroy(sk);
2251         else
2252                 sk_destruct(sk);
2253 }
2254
2255 void sk_free(struct sock *sk)
2256 {
2257         /*
2258          * We subtract one from sk_wmem_alloc and can know if
2259          * some packets are still in some tx queue.
2260          * If not null, sock_wfree() will call __sk_free(sk) later
2261          */
2262         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2263                 __sk_free(sk);
2264 }
2265 EXPORT_SYMBOL(sk_free);
2266
2267 static void sk_init_common(struct sock *sk)
2268 {
2269         skb_queue_head_init(&sk->sk_receive_queue);
2270         skb_queue_head_init(&sk->sk_write_queue);
2271         skb_queue_head_init(&sk->sk_error_queue);
2272
2273         rwlock_init(&sk->sk_callback_lock);
2274         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2275                         af_rlock_keys + sk->sk_family,
2276                         af_family_rlock_key_strings[sk->sk_family]);
2277         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2278                         af_wlock_keys + sk->sk_family,
2279                         af_family_wlock_key_strings[sk->sk_family]);
2280         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2281                         af_elock_keys + sk->sk_family,
2282                         af_family_elock_key_strings[sk->sk_family]);
2283         lockdep_set_class_and_name(&sk->sk_callback_lock,
2284                         af_callback_keys + sk->sk_family,
2285                         af_family_clock_key_strings[sk->sk_family]);
2286 }
2287
2288 /**
2289  *      sk_clone_lock - clone a socket, and lock its clone
2290  *      @sk: the socket to clone
2291  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2292  *
2293  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2294  */
2295 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2296 {
2297         struct proto *prot = READ_ONCE(sk->sk_prot);
2298         struct sk_filter *filter;
2299         bool is_charged = true;
2300         struct sock *newsk;
2301
2302         newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2303         if (!newsk)
2304                 goto out;
2305
2306         sock_copy(newsk, sk);
2307
2308         newsk->sk_prot_creator = prot;
2309
2310         /* SANITY */
2311         if (likely(newsk->sk_net_refcnt)) {
2312                 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2313                 sock_inuse_add(sock_net(newsk), 1);
2314         } else {
2315                 /* Kernel sockets are not elevating the struct net refcount.
2316                  * Instead, use a tracker to more easily detect if a layer
2317                  * is not properly dismantling its kernel sockets at netns
2318                  * destroy time.
2319                  */
2320                 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2321                                       false, priority);
2322         }
2323         sk_node_init(&newsk->sk_node);
2324         sock_lock_init(newsk);
2325         bh_lock_sock(newsk);
2326         newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
2327         newsk->sk_backlog.len = 0;
2328
2329         atomic_set(&newsk->sk_rmem_alloc, 0);
2330
2331         /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2332         refcount_set(&newsk->sk_wmem_alloc, 1);
2333
2334         atomic_set(&newsk->sk_omem_alloc, 0);
2335         sk_init_common(newsk);
2336
2337         newsk->sk_dst_cache     = NULL;
2338         newsk->sk_dst_pending_confirm = 0;
2339         newsk->sk_wmem_queued   = 0;
2340         newsk->sk_forward_alloc = 0;
2341         newsk->sk_reserved_mem  = 0;
2342         atomic_set(&newsk->sk_drops, 0);
2343         newsk->sk_send_head     = NULL;
2344         newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2345         atomic_set(&newsk->sk_zckey, 0);
2346
2347         sock_reset_flag(newsk, SOCK_DONE);
2348
2349         /* sk->sk_memcg will be populated at accept() time */
2350         newsk->sk_memcg = NULL;
2351
2352         cgroup_sk_clone(&newsk->sk_cgrp_data);
2353
2354         rcu_read_lock();
2355         filter = rcu_dereference(sk->sk_filter);
2356         if (filter != NULL)
2357                 /* though it's an empty new sock, the charging may fail
2358                  * if sysctl_optmem_max was changed between creation of
2359                  * original socket and cloning
2360                  */
2361                 is_charged = sk_filter_charge(newsk, filter);
2362         RCU_INIT_POINTER(newsk->sk_filter, filter);
2363         rcu_read_unlock();
2364
2365         if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2366                 /* We need to make sure that we don't uncharge the new
2367                  * socket if we couldn't charge it in the first place
2368                  * as otherwise we uncharge the parent's filter.
2369                  */
2370                 if (!is_charged)
2371                         RCU_INIT_POINTER(newsk->sk_filter, NULL);
2372                 sk_free_unlock_clone(newsk);
2373                 newsk = NULL;
2374                 goto out;
2375         }
2376         RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2377
2378         if (bpf_sk_storage_clone(sk, newsk)) {
2379                 sk_free_unlock_clone(newsk);
2380                 newsk = NULL;
2381                 goto out;
2382         }
2383
2384         /* Clear sk_user_data if parent had the pointer tagged
2385          * as not suitable for copying when cloning.
2386          */
2387         if (sk_user_data_is_nocopy(newsk))
2388                 newsk->sk_user_data = NULL;
2389
2390         newsk->sk_err      = 0;
2391         newsk->sk_err_soft = 0;
2392         newsk->sk_priority = 0;
2393         newsk->sk_incoming_cpu = raw_smp_processor_id();
2394
2395         /* Before updating sk_refcnt, we must commit prior changes to memory
2396          * (Documentation/RCU/rculist_nulls.rst for details)
2397          */
2398         smp_wmb();
2399         refcount_set(&newsk->sk_refcnt, 2);
2400
2401         sk_set_socket(newsk, NULL);
2402         sk_tx_queue_clear(newsk);
2403         RCU_INIT_POINTER(newsk->sk_wq, NULL);
2404
2405         if (newsk->sk_prot->sockets_allocated)
2406                 sk_sockets_allocated_inc(newsk);
2407
2408         if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2409                 net_enable_timestamp();
2410 out:
2411         return newsk;
2412 }
2413 EXPORT_SYMBOL_GPL(sk_clone_lock);
2414
2415 void sk_free_unlock_clone(struct sock *sk)
2416 {
2417         /* It is still raw copy of parent, so invalidate
2418          * destructor and make plain sk_free() */
2419         sk->sk_destruct = NULL;
2420         bh_unlock_sock(sk);
2421         sk_free(sk);
2422 }
2423 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2424
2425 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2426 {
2427         bool is_ipv6 = false;
2428         u32 max_size;
2429
2430 #if IS_ENABLED(CONFIG_IPV6)
2431         is_ipv6 = (sk->sk_family == AF_INET6 &&
2432                    !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2433 #endif
2434         /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2435         max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2436                         READ_ONCE(dst->dev->gso_ipv4_max_size);
2437         if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2438                 max_size = GSO_LEGACY_MAX_SIZE;
2439
2440         return max_size - (MAX_TCP_HEADER + 1);
2441 }
2442
2443 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2444 {
2445         u32 max_segs = 1;
2446
2447         sk->sk_route_caps = dst->dev->features;
2448         if (sk_is_tcp(sk))
2449                 sk->sk_route_caps |= NETIF_F_GSO;
2450         if (sk->sk_route_caps & NETIF_F_GSO)
2451                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2452         if (unlikely(sk->sk_gso_disabled))
2453                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2454         if (sk_can_gso(sk)) {
2455                 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2456                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2457                 } else {
2458                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2459                         sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2460                         /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2461                         max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2462                 }
2463         }
2464         sk->sk_gso_max_segs = max_segs;
2465         sk_dst_set(sk, dst);
2466 }
2467 EXPORT_SYMBOL_GPL(sk_setup_caps);
2468
2469 /*
2470  *      Simple resource managers for sockets.
2471  */
2472
2473
2474 /*
2475  * Write buffer destructor automatically called from kfree_skb.
2476  */
2477 void sock_wfree(struct sk_buff *skb)
2478 {
2479         struct sock *sk = skb->sk;
2480         unsigned int len = skb->truesize;
2481         bool free;
2482
2483         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2484                 if (sock_flag(sk, SOCK_RCU_FREE) &&
2485                     sk->sk_write_space == sock_def_write_space) {
2486                         rcu_read_lock();
2487                         free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2488                         sock_def_write_space_wfree(sk);
2489                         rcu_read_unlock();
2490                         if (unlikely(free))
2491                                 __sk_free(sk);
2492                         return;
2493                 }
2494
2495                 /*
2496                  * Keep a reference on sk_wmem_alloc, this will be released
2497                  * after sk_write_space() call
2498                  */
2499                 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2500                 sk->sk_write_space(sk);
2501                 len = 1;
2502         }
2503         /*
2504          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2505          * could not do because of in-flight packets
2506          */
2507         if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2508                 __sk_free(sk);
2509 }
2510 EXPORT_SYMBOL(sock_wfree);
2511
2512 /* This variant of sock_wfree() is used by TCP,
2513  * since it sets SOCK_USE_WRITE_QUEUE.
2514  */
2515 void __sock_wfree(struct sk_buff *skb)
2516 {
2517         struct sock *sk = skb->sk;
2518
2519         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2520                 __sk_free(sk);
2521 }
2522
2523 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2524 {
2525         skb_orphan(skb);
2526         skb->sk = sk;
2527 #ifdef CONFIG_INET
2528         if (unlikely(!sk_fullsock(sk))) {
2529                 skb->destructor = sock_edemux;
2530                 sock_hold(sk);
2531                 return;
2532         }
2533 #endif
2534         skb->destructor = sock_wfree;
2535         skb_set_hash_from_sk(skb, sk);
2536         /*
2537          * We used to take a refcount on sk, but following operation
2538          * is enough to guarantee sk_free() wont free this sock until
2539          * all in-flight packets are completed
2540          */
2541         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2542 }
2543 EXPORT_SYMBOL(skb_set_owner_w);
2544
2545 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2546 {
2547 #ifdef CONFIG_TLS_DEVICE
2548         /* Drivers depend on in-order delivery for crypto offload,
2549          * partial orphan breaks out-of-order-OK logic.
2550          */
2551         if (skb->decrypted)
2552                 return false;
2553 #endif
2554         return (skb->destructor == sock_wfree ||
2555                 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2556 }
2557
2558 /* This helper is used by netem, as it can hold packets in its
2559  * delay queue. We want to allow the owner socket to send more
2560  * packets, as if they were already TX completed by a typical driver.
2561  * But we also want to keep skb->sk set because some packet schedulers
2562  * rely on it (sch_fq for example).
2563  */
2564 void skb_orphan_partial(struct sk_buff *skb)
2565 {
2566         if (skb_is_tcp_pure_ack(skb))
2567                 return;
2568
2569         if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2570                 return;
2571
2572         skb_orphan(skb);
2573 }
2574 EXPORT_SYMBOL(skb_orphan_partial);
2575
2576 /*
2577  * Read buffer destructor automatically called from kfree_skb.
2578  */
2579 void sock_rfree(struct sk_buff *skb)
2580 {
2581         struct sock *sk = skb->sk;
2582         unsigned int len = skb->truesize;
2583
2584         atomic_sub(len, &sk->sk_rmem_alloc);
2585         sk_mem_uncharge(sk, len);
2586 }
2587 EXPORT_SYMBOL(sock_rfree);
2588
2589 /*
2590  * Buffer destructor for skbs that are not used directly in read or write
2591  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2592  */
2593 void sock_efree(struct sk_buff *skb)
2594 {
2595         sock_put(skb->sk);
2596 }
2597 EXPORT_SYMBOL(sock_efree);
2598
/* Destructor for the prefetch/receive path, where the skb may point at a
 * socket without holding a reference on it (listen sockets, for one).
 * The reference is only dropped when sk_is_refcounted() says one is held.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        if (!sk_is_refcounted(sk))
                return;

        sock_gen_put(sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2610
2611 kuid_t sock_i_uid(struct sock *sk)
2612 {
2613         kuid_t uid;
2614
2615         read_lock_bh(&sk->sk_callback_lock);
2616         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2617         read_unlock_bh(&sk->sk_callback_lock);
2618         return uid;
2619 }
2620 EXPORT_SYMBOL(sock_i_uid);
2621
2622 unsigned long __sock_i_ino(struct sock *sk)
2623 {
2624         unsigned long ino;
2625
2626         read_lock(&sk->sk_callback_lock);
2627         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2628         read_unlock(&sk->sk_callback_lock);
2629         return ino;
2630 }
2631 EXPORT_SYMBOL(__sock_i_ino);
2632
/* Same result as __sock_i_ino(), with bottom halves disabled for the
 * duration of the lookup.
 */
unsigned long sock_i_ino(struct sock *sk)
{
        unsigned long inode_nr;

        local_bh_disable();
        inode_nr = __sock_i_ino(sk);
        local_bh_enable();

        return inode_nr;
}
EXPORT_SYMBOL(sock_i_ino);
2643
2644 /*
2645  * Allocate a skb from the socket's send buffer.
2646  */
2647 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2648                              gfp_t priority)
2649 {
2650         if (force ||
2651             refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2652                 struct sk_buff *skb = alloc_skb(size, priority);
2653
2654                 if (skb) {
2655                         skb_set_owner_w(skb, sk);
2656                         return skb;
2657                 }
2658         }
2659         return NULL;
2660 }
2661 EXPORT_SYMBOL(sock_wmalloc);
2662
2663 static void sock_ofree(struct sk_buff *skb)
2664 {
2665         struct sock *sk = skb->sk;
2666
2667         atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2668 }
2669
2670 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2671                              gfp_t priority)
2672 {
2673         struct sk_buff *skb;
2674
2675         /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2676         if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2677             READ_ONCE(sysctl_optmem_max))
2678                 return NULL;
2679
2680         skb = alloc_skb(size, priority);
2681         if (!skb)
2682                 return NULL;
2683
2684         atomic_add(skb->truesize, &sk->sk_omem_alloc);
2685         skb->sk = sk;
2686         skb->destructor = sock_ofree;
2687         return skb;
2688 }
2689
2690 /*
2691  * Allocate a memory block from the socket's option memory buffer.
2692  */
2693 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2694 {
2695         int optmem_max = READ_ONCE(sysctl_optmem_max);
2696
2697         if ((unsigned int)size <= optmem_max &&
2698             atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2699                 void *mem;
2700                 /* First do the add, to avoid the race if kmalloc
2701                  * might sleep.
2702                  */
2703                 atomic_add(size, &sk->sk_omem_alloc);
2704                 mem = kmalloc(size, priority);
2705                 if (mem)
2706                         return mem;
2707                 atomic_sub(size, &sk->sk_omem_alloc);
2708         }
2709         return NULL;
2710 }
2711 EXPORT_SYMBOL(sock_kmalloc);
2712
2713 /* Free an option memory block. Note, we actually want the inline
2714  * here as this allows gcc to detect the nullify and fold away the
2715  * condition entirely.
2716  */
2717 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2718                                   const bool nullify)
2719 {
2720         if (WARN_ON_ONCE(!mem))
2721                 return;
2722         if (nullify)
2723                 kfree_sensitive(mem);
2724         else
2725                 kfree(mem);
2726         atomic_sub(size, &sk->sk_omem_alloc);
2727 }
2728
/* Free an option memory block with kfree() (via __sock_kfree_s() with
 * nullify == false) and subtract @size from sk_omem_alloc.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
        __sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);
2734
/* Like sock_kfree_s(), but the block is released with kfree_sensitive()
 * (via __sock_kfree_s() with nullify == true).
 */
void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
        __sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);
2740
/* Sleep until the socket may send again.
 *
 * It is almost wait_for_tcp_memory minus release_sock/lock_sock.
 * I think these locks should be removed for datagram sockets.
 *
 * The loop ends when the timeout has run out, a signal is pending,
 * sk_wmem_alloc drops below sk_sndbuf, SEND_SHUTDOWN is set, or sk_err
 * is non-zero. Returns the timeout still remaining.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
        DEFINE_WAIT(wait);

        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
        for (;;) {
                if (!timeo)
                        break;
                if (signal_pending(current))
                        break;
                /* Flag the shortage and join the wait queue before
                 * re-checking the conditions below.
                 */
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
                if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
                        break;
                if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
                        break;
                if (READ_ONCE(sk->sk_err))
                        break;
                timeo = schedule_timeout(timeo);
        }
        finish_wait(sk_sleep(sk), &wait);
        return timeo;
}
2767
2768
2769 /*
2770  *      Generic send/receive buffer handlers
2771  */
2772
/**
 * sock_alloc_send_pskb - allocate a send skb, waiting for buffer space
 * @sk: socket the skb will be charged to
 * @header_len: passed to alloc_skb_with_frags() as the header length
 * @data_len: passed to alloc_skb_with_frags() as the data length
 * @noblock: passed to sock_sndtimeo() to pick the wait timeout
 * @errcode: where the error is stored when NULL is returned
 * @max_page_order: passed through to alloc_skb_with_frags()
 *
 * Loops until sk_wmem_alloc is below sk_sndbuf, then allocates the skb
 * and makes @sk its owner with skb_set_owner_w().
 *
 * Return: the new skb, or NULL. On the failure paths in this function
 * *@errcode is the pending socket error, -EPIPE after SEND_SHUTDOWN,
 * -EAGAIN when the timeout is zero, or sock_intr_errno() when a signal
 * is pending. If the allocation itself fails, @errcode is left to
 * alloc_skb_with_frags(), which receives the pointer.
 */
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
                                     unsigned long data_len, int noblock,
                                     int *errcode, int max_page_order)
{
        struct sk_buff *skb;
        long timeo;
        int err;

        timeo = sock_sndtimeo(sk, noblock);
        for (;;) {
                err = sock_error(sk);
                if (err != 0)
                        goto failure;

                err = -EPIPE;
                if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
                        goto failure;

                /* Room in the send buffer: go allocate. */
                if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
                        break;

                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                err = -EAGAIN;
                if (!timeo)
                        goto failure;
                if (signal_pending(current))
                        goto interrupted;
                timeo = sock_wait_for_wmem(sk, timeo);
        }
        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
                                   errcode, sk->sk_allocation);
        if (skb)
                skb_set_owner_w(skb, sk);
        return skb;

interrupted:
        err = sock_intr_errno(timeo);
failure:
        *errcode = err;
        return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);
2816
2817 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2818                      struct sockcm_cookie *sockc)
2819 {
2820         u32 tsflags;
2821
2822         switch (cmsg->cmsg_type) {
2823         case SO_MARK:
2824                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2825                     !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2826                         return -EPERM;
2827                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2828                         return -EINVAL;
2829                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2830                 break;
2831         case SO_TIMESTAMPING_OLD:
2832         case SO_TIMESTAMPING_NEW:
2833                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2834                         return -EINVAL;
2835
2836                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2837                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2838                         return -EINVAL;
2839
2840                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2841                 sockc->tsflags |= tsflags;
2842                 break;
2843         case SCM_TXTIME:
2844                 if (!sock_flag(sk, SOCK_TXTIME))
2845                         return -EINVAL;
2846                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2847                         return -EINVAL;
2848                 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2849                 break;
2850         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2851         case SCM_RIGHTS:
2852         case SCM_CREDENTIALS:
2853                 break;
2854         default:
2855                 return -EINVAL;
2856         }
2857         return 0;
2858 }
2859 EXPORT_SYMBOL(__sock_cmsg_send);
2860
2861 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2862                    struct sockcm_cookie *sockc)
2863 {
2864         struct cmsghdr *cmsg;
2865         int ret;
2866
2867         for_each_cmsghdr(cmsg, msg) {
2868                 if (!CMSG_OK(msg, cmsg))
2869                         return -EINVAL;
2870                 if (cmsg->cmsg_level != SOL_SOCKET)
2871                         continue;
2872                 ret = __sock_cmsg_send(sk, cmsg, sockc);
2873                 if (ret)
2874                         return ret;
2875         }
2876         return 0;
2877 }
2878 EXPORT_SYMBOL(sock_cmsg_send);
2879
2880 static void sk_enter_memory_pressure(struct sock *sk)
2881 {
2882         if (!sk->sk_prot->enter_memory_pressure)
2883                 return;
2884
2885         sk->sk_prot->enter_memory_pressure(sk);
2886 }
2887
2888 static void sk_leave_memory_pressure(struct sock *sk)
2889 {
2890         if (sk->sk_prot->leave_memory_pressure) {
2891                 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2892                                      tcp_leave_memory_pressure, sk);
2893         } else {
2894                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2895
2896                 if (memory_pressure && READ_ONCE(*memory_pressure))
2897                         WRITE_ONCE(*memory_pressure, 0);
2898         }
2899 }
2900
/* Static key, off by default. While it is enabled, skb_page_frag_refill()
 * skips its high-order allocation attempt and allocates a single page.
 */
DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);

/**
 * skb_page_frag_refill - check that a page_frag contains enough room
 * @sz: minimum size of the fragment we want to get
 * @pfrag: pointer to page_frag
 * @gfp: priority for memory allocation
 *
 * Note: While this allocator tries to use high order pages, there is
 * no guarantee that allocations succeed. Therefore, @sz MUST be
 * less than or equal to PAGE_SIZE.
 *
 * Return: true when @pfrag can be used (existing page kept or a new one
 * allocated), false when no page could be allocated.
 */
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
        if (pfrag->page) {
                /* A reference count of one is taken to mean nobody else
                 * still uses the page, so it is reused from offset 0.
                 */
                if (page_ref_count(pfrag->page) == 1) {
                        pfrag->offset = 0;
                        return true;
                }
                /* Still enough room behind the current offset. */
                if (pfrag->offset + sz <= pfrag->size)
                        return true;
                /* Page exhausted: drop our reference and get a new one. */
                put_page(pfrag->page);
        }

        pfrag->offset = 0;
        if (SKB_FRAG_PAGE_ORDER &&
            !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
                /* Avoid direct reclaim but allow kswapd to wake */
                pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
                                          __GFP_COMP | __GFP_NOWARN |
                                          __GFP_NORETRY,
                                          SKB_FRAG_PAGE_ORDER);
                if (likely(pfrag->page)) {
                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
                        return true;
                }
        }
        /* Fall back to a single page with the caller's original flags. */
        pfrag->page = alloc_page(gfp);
        if (likely(pfrag->page)) {
                pfrag->size = PAGE_SIZE;
                return true;
        }
        return false;
}
EXPORT_SYMBOL(skb_page_frag_refill);
2946
2947 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2948 {
2949         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2950                 return true;
2951
2952         sk_enter_memory_pressure(sk);
2953         sk_stream_moderate_sndbuf(sk);
2954         return false;
2955 }
2956 EXPORT_SYMBOL(sk_page_frag_refill);
2957
/* Sleep until no user context owns the socket any more.
 *
 * As the sparse annotations say, sk->sk_lock.slock is released and
 * re-acquired inside: each iteration drops it before schedule() and
 * takes it again afterwards, so it is expected to be held on entry and
 * is held again on return.
 */
void __lock_sock(struct sock *sk)
        __releases(&sk->sk_lock.slock)
        __acquires(&sk->sk_lock.slock)
{
        DEFINE_WAIT(wait);

        for (;;) {
                /* Queue ourselves as an exclusive waiter before dropping
                 * the spinlock.
                 */
                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
                                        TASK_UNINTERRUPTIBLE);
                spin_unlock_bh(&sk->sk_lock.slock);
                schedule();
                spin_lock_bh(&sk->sk_lock.slock);
                if (!sock_owned_by_user(sk))
                        break;
        }
        finish_wait(&sk->sk_lock.wq, &wait);
}
2975
/* Drain the socket backlog.
 *
 * Each pass detaches the whole backlog list, drops sk->sk_lock.slock,
 * hands every skb to sk_backlog_rcv() and re-takes the lock. Passes
 * repeat until the backlog is found empty. Per the sparse annotations
 * the spinlock is held on entry and on return.
 */
void __release_sock(struct sock *sk)
        __releases(&sk->sk_lock.slock)
        __acquires(&sk->sk_lock.slock)
{
        struct sk_buff *skb, *next;

        while ((skb = sk->sk_backlog.head) != NULL) {
                /* Take the current list private; new arrivals start a
                 * fresh list handled by the next pass.
                 */
                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

                spin_unlock_bh(&sk->sk_lock.slock);

                do {
                        next = skb->next;
                        prefetch(next);
                        DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
                        skb_mark_not_on_list(skb);
                        sk_backlog_rcv(sk, skb);

                        cond_resched();

                        skb = next;
                } while (skb != NULL);

                spin_lock_bh(&sk->sk_lock.slock);
        }

        /*
         * Doing the zeroing here guarantees we can not loop forever
         * while a wild producer attempts to flood us.
         */
        sk->sk_backlog.len = 0;
}
3008
/* Run __release_sock() with sk->sk_lock.slock taken (bottom halves
 * disabled) around the call.
 */
void __sk_flush_backlog(struct sock *sk)
{
        spin_lock_bh(&sk->sk_lock.slock);
        __release_sock(sk);
        spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3016
/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 * @skb:   last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 *
 * The wait condition is that the tail of sk_receive_queue differs
 * from @skb.
 *
 * Return: the value produced by sk_wait_event().
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
        DEFINE_WAIT_FUNC(wait, woken_wake_function);
        int rc;

        add_wait_queue(sk_sleep(sk), &wait);
        /* SOCKWQ_ASYNC_WAITDATA is set only for the duration of the wait. */
        sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
        rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
        sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
        remove_wait_queue(sk_sleep(sk), &wait);
        return rc;
}
EXPORT_SYMBOL(sk_wait_data);
3041
/**
 *      __sk_mem_raise_allocated - increase memory_allocated
 *      @sk: socket
 *      @size: memory size to allocate
 *      @amt: pages to allocate
 *      @kind: allocation type
 *
 *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
 *
 *      The @amt pages are added to the protocol's memory_allocated (and
 *      charged to the socket's memcg, if any) up front. When the
 *      allocation is refused, both are rolled back before returning.
 *
 *      Return: 1 if the allocation is allowed, 0 if it is suppressed.
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
        bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
        struct proto *prot = sk->sk_prot;
        bool charged = true;
        long allocated;

        sk_memory_allocated_add(sk, amt);
        allocated = sk_memory_allocated(sk);
        /* A failed memcg charge goes straight to the suppress path. */
        if (memcg_charge &&
            !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
                                                gfp_memcg_charge())))
                goto suppress_allocation;

        /* Under limit. */
        if (allocated <= sk_prot_mem_limits(sk, 0)) {
                sk_leave_memory_pressure(sk);
                return 1;
        }

        /* Under pressure. */
        if (allocated > sk_prot_mem_limits(sk, 1))
                sk_enter_memory_pressure(sk);

        /* Over hard limit. */
        if (allocated > sk_prot_mem_limits(sk, 2))
                goto suppress_allocation;

        /* guarantee minimum buffer size under pressure */
        if (kind == SK_MEM_RECV) {
                if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
                        return 1;

        } else { /* SK_MEM_SEND */
                int wmem0 = sk_get_wmem0(sk, prot);

                if (sk->sk_type == SOCK_STREAM) {
                        if (sk->sk_wmem_queued < wmem0)
                                return 1;
                } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
                                return 1;
                }
        }

        if (sk_has_memory_pressure(sk)) {
                u64 alloc;

                if (!sk_under_memory_pressure(sk))
                        return 1;
                /* Allow it while the hard limit exceeds the socket count
                 * multiplied by the pages this socket accounts for.
                 */
                alloc = sk_sockets_allocated_read_positive(sk);
                if (sk_prot_mem_limits(sk, 2) > alloc *
                    sk_mem_pages(sk->sk_wmem_queued +
                                 atomic_read(&sk->sk_rmem_alloc) +
                                 sk->sk_forward_alloc))
                        return 1;
        }

suppress_allocation:

        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
                sk_stream_moderate_sndbuf(sk);

                /* Fail only if socket is _under_ its sndbuf.
                 * In this case we cannot block, so that we have to fail.
                 */
                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
                        /* Force charge with __GFP_NOFAIL */
                        if (memcg_charge && !charged) {
                                mem_cgroup_charge_skmem(sk->sk_memcg, amt,
                                        gfp_memcg_charge() | __GFP_NOFAIL);
                        }
                        return 1;
                }
        }

        if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
                trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

        /* Roll back the accounting done at the top of the function. */
        sk_memory_allocated_sub(sk, amt);

        if (memcg_charge && charged)
                mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);

        return 0;
}
3136
3137 /**
3138  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3139  *      @sk: socket
3140  *      @size: memory size to allocate
3141  *      @kind: allocation type
3142  *
3143  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3144  *      rmem allocation. This function assumes that protocols which have
3145  *      memory_pressure use sk_wmem_queued as write buffer accounting.
3146  */
3147 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3148 {
3149         int ret, amt = sk_mem_pages(size);
3150
3151         sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3152         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3153         if (!ret)
3154                 sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3155         return ret;
3156 }
3157 EXPORT_SYMBOL(__sk_mem_schedule);
3158
3159 /**
3160  *      __sk_mem_reduce_allocated - reclaim memory_allocated
3161  *      @sk: socket
3162  *      @amount: number of quanta
3163  *
3164  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3165  */
3166 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3167 {
3168         sk_memory_allocated_sub(sk, amount);
3169
3170         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3171                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3172
3173         if (sk_under_global_memory_pressure(sk) &&
3174             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3175                 sk_leave_memory_pressure(sk);
3176 }
3177
3178 /**
3179  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3180  *      @sk: socket
3181  *      @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3182  */
3183 void __sk_mem_reclaim(struct sock *sk, int amount)
3184 {
3185         amount >>= PAGE_SHIFT;
3186         sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3187         __sk_mem_reduce_allocated(sk, amount);
3188 }
3189 EXPORT_SYMBOL(__sk_mem_reclaim);
3190
/* Store @val in sk->sk_peek_off with WRITE_ONCE(). Always returns 0. */
int sk_set_peek_off(struct sock *sk, int val)
{
        WRITE_ONCE(sk->sk_peek_off, val);
        return 0;
}
EXPORT_SYMBOL_GPL(sk_set_peek_off);
3197
3198 /*
3199  * Set of default routines for initialising struct proto_ops when
3200  * the protocol does not support a particular function. In certain
3201  * cases where it makes no sense for a protocol to have a "do nothing"
3202  * function, some default processing is provided.
3203  */
3204
/* bind() stub for protocols without bind support: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);
3210
/* connect() stub for protocols without connect support: always
 * -EOPNOTSUPP.
 */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
                    int len, int flags)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);
3217
/* socketpair() stub for protocols without socketpair support: always
 * -EOPNOTSUPP.
 */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);
3223
/* accept() stub for protocols without accept support: always
 * -EOPNOTSUPP.
 */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
                   bool kern)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);
3230
/* getname stub for protocols without name lookup support: always
 * -EOPNOTSUPP.
 */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
                    int peer)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);
3237
/* ioctl() stub for protocols without ioctl support: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);
3243
/* listen() stub for protocols without listen support: always
 * -EOPNOTSUPP.
 */
int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);
3249
/* shutdown() stub for protocols without shutdown support: always
 * -EOPNOTSUPP.
 */
int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);
3255
/* sendmsg() stub for protocols without sendmsg support: always
 * -EOPNOTSUPP.
 */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);
3261
/* Variant of the sendmsg stub that takes a struct sock instead of a
 * struct socket: always -EOPNOTSUPP.
 */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);
3267
/* recvmsg() stub for protocols without recvmsg support: always
 * -EOPNOTSUPP.
 */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
                    int flags)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);
3274
/* mmap() stub for protocols without mmap support. Unlike the other
 * sock_no_*() stubs it returns -ENODEV rather than -EOPNOTSUPP.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
        /* Mirror missing mmap method error code */
        return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);
3281
3282 /*
3283  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3284  * various sock-based usage counts.
3285  */
3286 void __receive_sock(struct file *file)
3287 {
3288         struct socket *sock;
3289
3290         sock = sock_from_file(file);
3291         if (sock) {
3292                 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3293                 sock_update_classid(&sock->sk->sk_cgrp_data);
3294         }
3295 }
3296
3297 /*
3298  *      Default Socket Callbacks
3299  */
3300
/* Default sk_state_change callback (installed by sock_init_data_uid()):
 * wakes every interruptible sleeper on the socket's wait queue, if
 * there is any. sk->sk_wq is read under rcu_read_lock().
 */
static void sock_def_wakeup(struct sock *sk)
{
        struct socket_wq *wq;

        rcu_read_lock();
        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_all(&wq->wait);
        rcu_read_unlock();
}
3311
/* Default sk_error_report callback (installed by sock_init_data_uid()):
 * wakes sleepers polling for EPOLLERR and sends the asynchronous
 * SOCK_WAKE_IO / POLL_ERR notification, all under rcu_read_lock().
 */
static void sock_def_error_report(struct sock *sk)
{
        struct socket_wq *wq;

        rcu_read_lock();
        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_poll(&wq->wait, EPOLLERR);
        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
        rcu_read_unlock();
}
3323
/* Default sk_data_ready callback (installed by sock_init_data_uid()):
 * emits the sk_data_ready tracepoint, wakes sleepers polling for
 * readable events and sends the asynchronous SOCK_WAKE_WAITD / POLL_IN
 * notification.
 */
void sock_def_readable(struct sock *sk)
{
        struct socket_wq *wq;

        trace_sk_data_ready(sk);

        rcu_read_lock();
        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
                                                EPOLLRDNORM | EPOLLRDBAND);
        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
        rcu_read_unlock();
}
3338
3339 static void sock_def_write_space(struct sock *sk)
3340 {
3341         struct socket_wq *wq;
3342
3343         rcu_read_lock();
3344
3345         /* Do not wake up a writer until he can make "significant"
3346          * progress.  --DaveM
3347          */
3348         if (sock_writeable(sk)) {
3349                 wq = rcu_dereference(sk->sk_wq);
3350                 if (skwq_has_sleeper(wq))
3351                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3352                                                 EPOLLWRNORM | EPOLLWRBAND);
3353
3354                 /* Should agree with poll, otherwise some programs break */
3355                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3356         }
3357
3358         rcu_read_unlock();
3359 }
3360
/* An optimised version of sock_def_write_space(), should only be called
 * for SOCK_RCU_FREE sockets under RCU read section and after putting
 * ->sk_wmem_alloc.
 *
 * Unlike sock_def_write_space() it does not take rcu_read_lock() itself,
 * and it checks for waiters with waitqueue_active() after an explicit
 * smp_mb__after_atomic().
 */
static void sock_def_write_space_wfree(struct sock *sk)
{
        /* Do not wake up a writer until he can make "significant"
         * progress.  --DaveM
         */
        if (sock_writeable(sk)) {
                struct socket_wq *wq = rcu_dereference(sk->sk_wq);

                /* rely on refcount_sub from sock_wfree() */
                smp_mb__after_atomic();
                if (wq && waitqueue_active(&wq->wait))
                        wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
                                                EPOLLWRNORM | EPOLLWRBAND);

                /* Should agree with poll, otherwise some programs break */
                sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
        }
}
3383
/* Default sk_destruct callback (installed by sock_init_data_uid()):
 * intentionally does nothing.
 */
static void sock_def_destruct(struct sock *sk)
{
}
3387
3388 void sk_send_sigurg(struct sock *sk)
3389 {
3390         if (sk->sk_socket && sk->sk_socket->file)
3391                 if (send_sigurg(&sk->sk_socket->file->f_owner))
3392                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3393 }
3394 EXPORT_SYMBOL(sk_send_sigurg);
3395
/* (Re)arm @timer to fire at @expires on behalf of @sk. A socket
 * reference is taken only when mod_timer() returns zero; per the timer
 * API that means the timer was not pending before this call.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
                    unsigned long expires)
{
        if (mod_timer(timer, expires))
                return;

        sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
3403
/* Cancel @timer with del_timer(). When del_timer() returns non-zero, the
 * socket reference held on behalf of the timer is dropped through
 * __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
        if (!del_timer(timer))
                return;

        __sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
3410
/* Same as sk_stop_timer(), but cancels with del_timer_sync(). */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
        if (!del_timer_sync(timer))
                return;

        __sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer_sync);
3417
/**
 * sock_init_data_uid - set the generic fields of a sock to their defaults
 * @sock: struct socket to attach to @sk, may be NULL
 * @sk: sock being initialised
 * @uid: value stored in sk->sk_uid
 *
 * Sets default buffer sizes, timeouts and callbacks, initialises the
 * locks and timer, links @sk and @sock to each other when @sock is
 * non-NULL, and finally sets sk_refcnt to 1.
 */
void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
{
        sk_init_common(sk);
        sk->sk_send_head        =       NULL;

        timer_setup(&sk->sk_timer, NULL, 0);

        sk->sk_allocation       =       GFP_KERNEL;
        sk->sk_rcvbuf           =       READ_ONCE(sysctl_rmem_default);
        sk->sk_sndbuf           =       READ_ONCE(sysctl_wmem_default);
        sk->sk_state            =       TCP_CLOSE;
        sk->sk_use_task_frag    =       true;
        sk_set_socket(sk, sock);

        sock_set_flag(sk, SOCK_ZAPPED);

        /* With a struct socket, inherit its type and share its wait
         * queue; without one the sock has no wait queue.
         */
        if (sock) {
                sk->sk_type     =       sock->type;
                RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
                sock->sk        =       sk;
        } else {
                RCU_INIT_POINTER(sk->sk_wq, NULL);
        }
        sk->sk_uid      =       uid;

        /* Kernel sockets get their own lockdep class for
         * sk_callback_lock, keyed by address family.
         */
        rwlock_init(&sk->sk_callback_lock);
        if (sk->sk_kern_sock)
                lockdep_set_class_and_name(
                        &sk->sk_callback_lock,
                        af_kern_callback_keys + sk->sk_family,
                        af_family_kern_clock_key_strings[sk->sk_family]);
        else
                lockdep_set_class_and_name(
                        &sk->sk_callback_lock,
                        af_callback_keys + sk->sk_family,
                        af_family_clock_key_strings[sk->sk_family]);

        /* Default callbacks. */
        sk->sk_state_change     =       sock_def_wakeup;
        sk->sk_data_ready       =       sock_def_readable;
        sk->sk_write_space      =       sock_def_write_space;
        sk->sk_error_report     =       sock_def_error_report;
        sk->sk_destruct         =       sock_def_destruct;

        sk->sk_frag.page        =       NULL;
        sk->sk_frag.offset      =       0;
        sk->sk_peek_off         =       -1;

        sk->sk_peer_pid         =       NULL;
        sk->sk_peer_cred        =       NULL;
        spin_lock_init(&sk->sk_peer_lock);

        sk->sk_write_pending    =       0;
        sk->sk_rcvlowat         =       1;
        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;

        sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG==32
        seqlock_init(&sk->sk_stamp_seq);
#endif
        atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
        sk->sk_napi_id          =       0;
        sk->sk_ll_usec          =       READ_ONCE(sysctl_net_busy_read);
#endif

        sk->sk_max_pacing_rate = ~0UL;
        sk->sk_pacing_rate = ~0UL;
        WRITE_ONCE(sk->sk_pacing_shift, 10);
        sk->sk_incoming_cpu = -1;

        sk_rx_queue_clear(sk);
        /*
         * Before updating sk_refcnt, we must commit prior changes to memory
         * (Documentation/RCU/rculist_nulls.rst for details)
         */
        smp_wmb();
        refcount_set(&sk->sk_refcnt, 1);
        atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data_uid);
3500
3501 void sock_init_data(struct socket *sock, struct sock *sk)
3502 {
3503         kuid_t uid = sock ?
3504                 SOCK_INODE(sock)->i_uid :
3505                 make_kuid(sock_net(sk)->user_ns, 0);
3506
3507         sock_init_data_uid(sock, sk, uid);
3508 }
3509 EXPORT_SYMBOL(sock_init_data);
3510
3511 void lock_sock_nested(struct sock *sk, int subclass)
3512 {
3513         /* The sk_lock has mutex_lock() semantics here. */
3514         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3515
3516         might_sleep();
3517         spin_lock_bh(&sk->sk_lock.slock);
3518         if (sock_owned_by_user_nocheck(sk))
3519                 __lock_sock(sk);
3520         sk->sk_lock.owned = 1;
3521         spin_unlock_bh(&sk->sk_lock.slock);
3522 }
3523 EXPORT_SYMBOL(lock_sock_nested);
3524
3525 void release_sock(struct sock *sk)
3526 {
3527         spin_lock_bh(&sk->sk_lock.slock);
3528         if (sk->sk_backlog.tail)
3529                 __release_sock(sk);
3530
3531         /* Warning : release_cb() might need to release sk ownership,
3532          * ie call sock_release_ownership(sk) before us.
3533          */
3534         if (sk->sk_prot->release_cb)
3535                 sk->sk_prot->release_cb(sk);
3536
3537         sock_release_ownership(sk);
3538         if (waitqueue_active(&sk->sk_lock.wq))
3539                 wake_up(&sk->sk_lock.wq);
3540         spin_unlock_bh(&sk->sk_lock.slock);
3541 }
3542 EXPORT_SYMBOL(release_sock);
3543
3544 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3545 {
3546         might_sleep();
3547         spin_lock_bh(&sk->sk_lock.slock);
3548
3549         if (!sock_owned_by_user_nocheck(sk)) {
3550                 /*
3551                  * Fast path return with bottom halves disabled and
3552                  * sock::sk_lock.slock held.
3553                  *
3554                  * The 'mutex' is not contended and holding
3555                  * sock::sk_lock.slock prevents all other lockers to
3556                  * proceed so the corresponding unlock_sock_fast() can
3557                  * avoid the slow path of release_sock() completely and
3558                  * just release slock.
3559                  *
3560                  * From a semantical POV this is equivalent to 'acquiring'
3561                  * the 'mutex', hence the corresponding lockdep
3562                  * mutex_release() has to happen in the fast path of
3563                  * unlock_sock_fast().
3564                  */
3565                 return false;
3566         }
3567
3568         __lock_sock(sk);
3569         sk->sk_lock.owned = 1;
3570         __acquire(&sk->sk_lock.slock);
3571         spin_unlock_bh(&sk->sk_lock.slock);
3572         return true;
3573 }
3574 EXPORT_SYMBOL(__lock_sock_fast);
3575
3576 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3577                    bool timeval, bool time32)
3578 {
3579         struct sock *sk = sock->sk;
3580         struct timespec64 ts;
3581
3582         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3583         ts = ktime_to_timespec64(sock_read_timestamp(sk));
3584         if (ts.tv_sec == -1)
3585                 return -ENOENT;
3586         if (ts.tv_sec == 0) {
3587                 ktime_t kt = ktime_get_real();
3588                 sock_write_timestamp(sk, kt);
3589                 ts = ktime_to_timespec64(kt);
3590         }
3591
3592         if (timeval)
3593                 ts.tv_nsec /= 1000;
3594
3595 #ifdef CONFIG_COMPAT_32BIT_TIME
3596         if (time32)
3597                 return put_old_timespec32(&ts, userstamp);
3598 #endif
3599 #ifdef CONFIG_SPARC64
3600         /* beware of padding in sparc64 timeval */
3601         if (timeval && !in_compat_syscall()) {
3602                 struct __kernel_old_timeval __user tv = {
3603                         .tv_sec = ts.tv_sec,
3604                         .tv_usec = ts.tv_nsec,
3605                 };
3606                 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3607                         return -EFAULT;
3608                 return 0;
3609         }
3610 #endif
3611         return put_timespec64(&ts, userstamp);
3612 }
3613 EXPORT_SYMBOL(sock_gettstamp);
3614
3615 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3616 {
3617         if (!sock_flag(sk, flag)) {
3618                 unsigned long previous_flags = sk->sk_flags;
3619
3620                 sock_set_flag(sk, flag);
3621                 /*
3622                  * we just set one of the two flags which require net
3623                  * time stamping, but time stamping might have been on
3624                  * already because of the other one
3625                  */
3626                 if (sock_needs_netstamp(sk) &&
3627                     !(previous_flags & SK_FLAGS_TIMESTAMP))
3628                         net_enable_timestamp();
3629         }
3630 }
3631
3632 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3633                        int level, int type)
3634 {
3635         struct sock_exterr_skb *serr;
3636         struct sk_buff *skb;
3637         int copied, err;
3638
3639         err = -EAGAIN;
3640         skb = sock_dequeue_err_skb(sk);
3641         if (skb == NULL)
3642                 goto out;
3643
3644         copied = skb->len;
3645         if (copied > len) {
3646                 msg->msg_flags |= MSG_TRUNC;
3647                 copied = len;
3648         }
3649         err = skb_copy_datagram_msg(skb, 0, msg, copied);
3650         if (err)
3651                 goto out_free_skb;
3652
3653         sock_recv_timestamp(msg, sk, skb);
3654
3655         serr = SKB_EXT_ERR(skb);
3656         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3657
3658         msg->msg_flags |= MSG_ERRQUEUE;
3659         err = copied;
3660
3661 out_free_skb:
3662         kfree_skb(skb);
3663 out:
3664         return err;
3665 }
3666 EXPORT_SYMBOL(sock_recv_errqueue);
3667
3668 /*
3669  *      Get a socket option on an socket.
3670  *
3671  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3672  *      asynchronous errors should be reported by getsockopt. We assume
3673  *      this means if you specify SO_ERROR (otherwise whats the point of it).
3674  */
3675 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3676                            char __user *optval, int __user *optlen)
3677 {
3678         struct sock *sk = sock->sk;
3679
3680         /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3681         return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3682 }
3683 EXPORT_SYMBOL(sock_common_getsockopt);
3684
3685 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3686                         int flags)
3687 {
3688         struct sock *sk = sock->sk;
3689         int addr_len = 0;
3690         int err;
3691
3692         err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3693         if (err >= 0)
3694                 msg->msg_namelen = addr_len;
3695         return err;
3696 }
3697 EXPORT_SYMBOL(sock_common_recvmsg);
3698
3699 /*
3700  *      Set socket options on an inet socket.
3701  */
3702 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3703                            sockptr_t optval, unsigned int optlen)
3704 {
3705         struct sock *sk = sock->sk;
3706
3707         /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3708         return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3709 }
3710 EXPORT_SYMBOL(sock_common_setsockopt);
3711
3712 void sk_common_release(struct sock *sk)
3713 {
3714         if (sk->sk_prot->destroy)
3715                 sk->sk_prot->destroy(sk);
3716
3717         /*
3718          * Observation: when sk_common_release is called, processes have
3719          * no access to socket. But net still has.
3720          * Step one, detach it from networking:
3721          *
3722          * A. Remove from hash tables.
3723          */
3724
3725         sk->sk_prot->unhash(sk);
3726
3727         /*
3728          * In this point socket cannot receive new packets, but it is possible
3729          * that some packets are in flight because some CPU runs receiver and
3730          * did hash table lookup before we unhashed socket. They will achieve
3731          * receive queue and will be purged by socket destructor.
3732          *
3733          * Also we still have packets pending on receive queue and probably,
3734          * our own packets waiting in device queues. sock_destroy will drain
3735          * receive queue, but transmitted packets will delay socket destruction
3736          * until the last reference will be released.
3737          */
3738
3739         sock_orphan(sk);
3740
3741         xfrm_sk_free_policy(sk);
3742
3743         sock_put(sk);
3744 }
3745 EXPORT_SYMBOL(sk_common_release);
3746
3747 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3748 {
3749         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3750
3751         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3752         mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3753         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3754         mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3755         mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
3756         mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3757         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3758         mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3759         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3760 }
3761
3762 #ifdef CONFIG_PROC_FS
3763 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3764
3765 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3766 {
3767         int cpu, idx = prot->inuse_idx;
3768         int res = 0;
3769
3770         for_each_possible_cpu(cpu)
3771                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3772
3773         return res >= 0 ? res : 0;
3774 }
3775 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3776
3777 int sock_inuse_get(struct net *net)
3778 {
3779         int cpu, res = 0;
3780
3781         for_each_possible_cpu(cpu)
3782                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3783
3784         return res;
3785 }
3786
3787 EXPORT_SYMBOL_GPL(sock_inuse_get);
3788
3789 static int __net_init sock_inuse_init_net(struct net *net)
3790 {
3791         net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3792         if (net->core.prot_inuse == NULL)
3793                 return -ENOMEM;
3794         return 0;
3795 }
3796
3797 static void __net_exit sock_inuse_exit_net(struct net *net)
3798 {
3799         free_percpu(net->core.prot_inuse);
3800 }
3801
3802 static struct pernet_operations net_inuse_ops = {
3803         .init = sock_inuse_init_net,
3804         .exit = sock_inuse_exit_net,
3805 };
3806
3807 static __init int net_inuse_init(void)
3808 {
3809         if (register_pernet_subsys(&net_inuse_ops))
3810                 panic("Cannot initialize net inuse counters");
3811
3812         return 0;
3813 }
3814
3815 core_initcall(net_inuse_init);
3816
3817 static int assign_proto_idx(struct proto *prot)
3818 {
3819         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3820
3821         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3822                 pr_err("PROTO_INUSE_NR exhausted\n");
3823                 return -ENOSPC;
3824         }
3825
3826         set_bit(prot->inuse_idx, proto_inuse_idx);
3827         return 0;
3828 }
3829
3830 static void release_proto_idx(struct proto *prot)
3831 {
3832         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3833                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3834 }
3835 #else
/* Without CONFIG_PROC_FS no index is tracked; always reports success. */
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}
3840
/* Without CONFIG_PROC_FS there is nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3844
3845 #endif
3846
3847 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3848 {
3849         if (!twsk_prot)
3850                 return;
3851         kfree(twsk_prot->twsk_slab_name);
3852         twsk_prot->twsk_slab_name = NULL;
3853         kmem_cache_destroy(twsk_prot->twsk_slab);
3854         twsk_prot->twsk_slab = NULL;
3855 }
3856
3857 static int tw_prot_init(const struct proto *prot)
3858 {
3859         struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3860
3861         if (!twsk_prot)
3862                 return 0;
3863
3864         twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3865                                               prot->name);
3866         if (!twsk_prot->twsk_slab_name)
3867                 return -ENOMEM;
3868
3869         twsk_prot->twsk_slab =
3870                 kmem_cache_create(twsk_prot->twsk_slab_name,
3871                                   twsk_prot->twsk_obj_size, 0,
3872                                   SLAB_ACCOUNT | prot->slab_flags,
3873                                   NULL);
3874         if (!twsk_prot->twsk_slab) {
3875                 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3876                         prot->name);
3877                 return -ENOMEM;
3878         }
3879
3880         return 0;
3881 }
3882
3883 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3884 {
3885         if (!rsk_prot)
3886                 return;
3887         kfree(rsk_prot->slab_name);
3888         rsk_prot->slab_name = NULL;
3889         kmem_cache_destroy(rsk_prot->slab);
3890         rsk_prot->slab = NULL;
3891 }
3892
3893 static int req_prot_init(const struct proto *prot)
3894 {
3895         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3896
3897         if (!rsk_prot)
3898                 return 0;
3899
3900         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3901                                         prot->name);
3902         if (!rsk_prot->slab_name)
3903                 return -ENOMEM;
3904
3905         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3906                                            rsk_prot->obj_size, 0,
3907                                            SLAB_ACCOUNT | prot->slab_flags,
3908                                            NULL);
3909
3910         if (!rsk_prot->slab) {
3911                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3912                         prot->name);
3913                 return -ENOMEM;
3914         }
3915         return 0;
3916 }
3917
3918 int proto_register(struct proto *prot, int alloc_slab)
3919 {
3920         int ret = -ENOBUFS;
3921
3922         if (prot->memory_allocated && !prot->sysctl_mem) {
3923                 pr_err("%s: missing sysctl_mem\n", prot->name);
3924                 return -EINVAL;
3925         }
3926         if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3927                 pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3928                 return -EINVAL;
3929         }
3930         if (alloc_slab) {
3931                 prot->slab = kmem_cache_create_usercopy(prot->name,
3932                                         prot->obj_size, 0,
3933                                         SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3934                                         prot->slab_flags,
3935                                         prot->useroffset, prot->usersize,
3936                                         NULL);
3937
3938                 if (prot->slab == NULL) {
3939                         pr_crit("%s: Can't create sock SLAB cache!\n",
3940                                 prot->name);
3941                         goto out;
3942                 }
3943
3944                 if (req_prot_init(prot))
3945                         goto out_free_request_sock_slab;
3946
3947                 if (tw_prot_init(prot))
3948                         goto out_free_timewait_sock_slab;
3949         }
3950
3951         mutex_lock(&proto_list_mutex);
3952         ret = assign_proto_idx(prot);
3953         if (ret) {
3954                 mutex_unlock(&proto_list_mutex);
3955                 goto out_free_timewait_sock_slab;
3956         }
3957         list_add(&prot->node, &proto_list);
3958         mutex_unlock(&proto_list_mutex);
3959         return ret;
3960
3961 out_free_timewait_sock_slab:
3962         if (alloc_slab)
3963                 tw_prot_cleanup(prot->twsk_prot);
3964 out_free_request_sock_slab:
3965         if (alloc_slab) {
3966                 req_prot_cleanup(prot->rsk_prot);
3967
3968                 kmem_cache_destroy(prot->slab);
3969                 prot->slab = NULL;
3970         }
3971 out:
3972         return ret;
3973 }
3974 EXPORT_SYMBOL(proto_register);
3975
3976 void proto_unregister(struct proto *prot)
3977 {
3978         mutex_lock(&proto_list_mutex);
3979         release_proto_idx(prot);
3980         list_del(&prot->node);
3981         mutex_unlock(&proto_list_mutex);
3982
3983         kmem_cache_destroy(prot->slab);
3984         prot->slab = NULL;
3985
3986         req_prot_cleanup(prot->rsk_prot);
3987         tw_prot_cleanup(prot->twsk_prot);
3988 }
3989 EXPORT_SYMBOL(proto_unregister);
3990
3991 int sock_load_diag_module(int family, int protocol)
3992 {
3993         if (!protocol) {
3994                 if (!sock_is_registered(family))
3995                         return -ENOENT;
3996
3997                 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3998                                       NETLINK_SOCK_DIAG, family);
3999         }
4000
4001 #ifdef CONFIG_INET
4002         if (family == AF_INET &&
4003             protocol != IPPROTO_RAW &&
4004             protocol < MAX_INET_PROTOS &&
4005             !rcu_access_pointer(inet_protos[protocol]))
4006                 return -ENOENT;
4007 #endif
4008
4009         return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4010                               NETLINK_SOCK_DIAG, family, protocol);
4011 }
4012 EXPORT_SYMBOL(sock_load_diag_module);
4013
4014 #ifdef CONFIG_PROC_FS
4015 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4016         __acquires(proto_list_mutex)
4017 {
4018         mutex_lock(&proto_list_mutex);
4019         return seq_list_start_head(&proto_list, *pos);
4020 }
4021
4022 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4023 {
4024         return seq_list_next(v, &proto_list, pos);
4025 }
4026
4027 static void proto_seq_stop(struct seq_file *seq, void *v)
4028         __releases(proto_list_mutex)
4029 {
4030         mutex_unlock(&proto_list_mutex);
4031 }
4032
/* Map a method pointer to 'y' (non-NULL) or 'n' (NULL). */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';
	return 'n';
}
4037 static long sock_prot_memory_allocated(struct proto *proto)
4038 {
4039         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4040 }
4041
4042 static const char *sock_prot_memory_pressure(struct proto *proto)
4043 {
4044         return proto->memory_pressure != NULL ?
4045         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4046 }
4047
4048 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4049 {
4050
4051         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4052                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4053                    proto->name,
4054                    proto->obj_size,
4055                    sock_prot_inuse_get(seq_file_net(seq), proto),
4056                    sock_prot_memory_allocated(proto),
4057                    sock_prot_memory_pressure(proto),
4058                    proto->max_header,
4059                    proto->slab == NULL ? "no" : "yes",
4060                    module_name(proto->owner),
4061                    proto_method_implemented(proto->close),
4062                    proto_method_implemented(proto->connect),
4063                    proto_method_implemented(proto->disconnect),
4064                    proto_method_implemented(proto->accept),
4065                    proto_method_implemented(proto->ioctl),
4066                    proto_method_implemented(proto->init),
4067                    proto_method_implemented(proto->destroy),
4068                    proto_method_implemented(proto->shutdown),
4069                    proto_method_implemented(proto->setsockopt),
4070                    proto_method_implemented(proto->getsockopt),
4071                    proto_method_implemented(proto->sendmsg),
4072                    proto_method_implemented(proto->recvmsg),
4073                    proto_method_implemented(proto->bind),
4074                    proto_method_implemented(proto->backlog_rcv),
4075                    proto_method_implemented(proto->hash),
4076                    proto_method_implemented(proto->unhash),
4077                    proto_method_implemented(proto->get_port),
4078                    proto_method_implemented(proto->enter_memory_pressure));
4079 }
4080
4081 static int proto_seq_show(struct seq_file *seq, void *v)
4082 {
4083         if (v == &proto_list)
4084                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4085                            "protocol",
4086                            "size",
4087                            "sockets",
4088                            "memory",
4089                            "press",
4090                            "maxhdr",
4091                            "slab",
4092                            "module",
4093                            "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4094         else
4095                 proto_seq_printf(seq, list_entry(v, struct proto, node));
4096         return 0;
4097 }
4098
4099 static const struct seq_operations proto_seq_ops = {
4100         .start  = proto_seq_start,
4101         .next   = proto_seq_next,
4102         .stop   = proto_seq_stop,
4103         .show   = proto_seq_show,
4104 };
4105
4106 static __net_init int proto_init_net(struct net *net)
4107 {
4108         if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4109                         sizeof(struct seq_net_private)))
4110                 return -ENOMEM;
4111
4112         return 0;
4113 }
4114
4115 static __net_exit void proto_exit_net(struct net *net)
4116 {
4117         remove_proc_entry("protocols", net->proc_net);
4118 }
4119
4120
4121 static __net_initdata struct pernet_operations proto_net_ops = {
4122         .init = proto_init_net,
4123         .exit = proto_exit_net,
4124 };
4125
4126 static int __init proto_init(void)
4127 {
4128         return register_pernet_subsys(&proto_net_ops);
4129 }
4130
4131 subsys_initcall(proto_init);
4132
4133 #endif /* PROC_FS */
4134
4135 #ifdef CONFIG_NET_RX_BUSY_POLL
4136 bool sk_busy_loop_end(void *p, unsigned long start_time)
4137 {
4138         struct sock *sk = p;
4139
4140         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4141                 return true;
4142
4143         if (sk_is_udp(sk) &&
4144             !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4145                 return true;
4146
4147         return sk_busy_loop_timeout(sk, start_time);
4148 }
4149 EXPORT_SYMBOL(sk_busy_loop_end);
4150 #endif /* CONFIG_NET_RX_BUSY_POLL */
4151
4152 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4153 {
4154         if (!sk->sk_prot->bind_add)
4155                 return -EOPNOTSUPP;
4156         return sk->sk_prot->bind_add(sk, addr, addr_len);
4157 }
4158 EXPORT_SYMBOL(sock_bind_add);
4159
4160 /* Copy 'size' bytes from userspace and return `size` back to userspace */
4161 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4162                      void __user *arg, void *karg, size_t size)
4163 {
4164         int ret;
4165
4166         if (copy_from_user(karg, arg, size))
4167                 return -EFAULT;
4168
4169         ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4170         if (ret)
4171                 return ret;
4172
4173         if (copy_to_user(arg, karg, size))
4174                 return -EFAULT;
4175
4176         return 0;
4177 }
4178 EXPORT_SYMBOL(sock_ioctl_inout);
4179
4180 /* This is the most common ioctl prep function, where the result (4 bytes) is
4181  * copied back to userspace if the ioctl() returns successfully. No input is
4182  * copied from userspace as input argument.
4183  */
4184 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4185 {
4186         int ret, karg = 0;
4187
4188         ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4189         if (ret)
4190                 return ret;
4191
4192         return put_user(karg, (int __user *)arg);
4193 }
4194
4195 /* A wrapper around sock ioctls, which copies the data from userspace
4196  * (depending on the protocol/ioctl), and copies back the result to userspace.
4197  * The main motivation for this function is to pass kernel memory to the
4198  * protocol ioctl callbacks, instead of userspace memory.
4199  */
4200 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4201 {
4202         int rc = 1;
4203
4204         if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4205                 rc = ipmr_sk_ioctl(sk, cmd, arg);
4206         else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4207                 rc = ip6mr_sk_ioctl(sk, cmd, arg);
4208         else if (sk_is_phonet(sk))
4209                 rc = phonet_sk_ioctl(sk, cmd, arg);
4210
4211         /* If ioctl was processed, returns its value */
4212         if (rc <= 0)
4213                 return rc;
4214
4215         /* Otherwise call the default handler */
4216         return sock_ioctl_out(sk, cmd, arg);
4217 }
4218 EXPORT_SYMBOL(sk_ioctl);