net/core/sock.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              Generic socket support routines. Memory allocators, socket lock/release
   8  *              handler for protocols to use and generic option handler.
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  */
  85
  86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88 #include <asm/unaligned.h>
  89 #include <linux/capability.h>
  90 #include <linux/errno.h>
  91 #include <linux/errqueue.h>
  92 #include <linux/types.h>
  93 #include <linux/socket.h>
  94 #include <linux/in.h>
  95 #include <linux/kernel.h>
  96 #include <linux/module.h>
  97 #include <linux/proc_fs.h>
  98 #include <linux/seq_file.h>
  99 #include <linux/sched.h>
 100 #include <linux/sched/mm.h>
 101 #include <linux/timer.h>
 102 #include <linux/string.h>
 103 #include <linux/sockios.h>
 104 #include <linux/net.h>
 105 #include <linux/mm.h>
 106 #include <linux/slab.h>
 107 #include <linux/interrupt.h>
 108 #include <linux/poll.h>
 109 #include <linux/tcp.h>
 110 #include <linux/init.h>
 111 #include <linux/highmem.h>
 112 #include <linux/user_namespace.h>
 113 #include <linux/static_key.h>
 114 #include <linux/memcontrol.h>
 115 #include <linux/prefetch.h>
 116 #include <linux/compat.h>
 117 #include <linux/mroute.h>
 118 #include <linux/mroute6.h>
 119 #include <linux/icmpv6.h>
 120
 121 #include <linux/uaccess.h>
 122
 123 #include <linux/netdevice.h>
 124 #include <net/protocol.h>
 125 #include <linux/skbuff.h>
 126 #include <net/net_namespace.h>
 127 #include <net/request_sock.h>
 128 #include <net/sock.h>
 129 #include <linux/net_tstamp.h>
 130 #include <net/xfrm.h>
 131 #include <linux/ipsec.h>
 132 #include <net/cls_cgroup.h>
 133 #include <net/netprio_cgroup.h>
 134 #include <linux/sock_diag.h>
 135
 136 #include <linux/filter.h>
 137 #include <net/sock_reuseport.h>
 138 #include <net/bpf_sk_storage.h>
 139
 140 #include <trace/events/sock.h>
 141
 142 #include <net/tcp.h>
 143 #include <net/busy_poll.h>
 144 #include <net/phonet/phonet.h>
 145
 146 #include <linux/ethtool.h>
 147
 148 #include "dev.h"
 149
 150 static DEFINE_MUTEX(proto_list_mutex);
 151 static LIST_HEAD(proto_list);
 152
 153 static void sock_def_write_space_wfree(struct sock *sk);
 154 static void sock_def_write_space(struct sock *sk);
 155
 156 /**
 157  * sk_ns_capable - General socket capability test
 158  * @sk: Socket to use a capability on or through
 159  * @user_ns: The user namespace of the capability to use
 160  * @cap: The capability to use
 161  *
 162  * Test to see if the opener of the socket had when the socket was
 163  * created and the current process has the capability @cap in the user
 164  * namespace @user_ns.
 165  */
 166 bool sk_ns_capable(const struct sock *sk,
 167                    struct user_namespace *user_ns, int cap)
 168 {
 169         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 170                 ns_capable(user_ns, cap);
 171 }
 172 EXPORT_SYMBOL(sk_ns_capable);
 173
 174 /**
 175  * sk_capable - Socket global capability test
 176  * @sk: Socket to use a capability on or through
 177  * @cap: The global capability to use
 178  *
 179  * Test to see if the opener of the socket had when the socket was
 180  * created and the current process has the capability @cap in all user
 181  * namespaces.
 182  */
 183 bool sk_capable(const struct sock *sk, int cap)
 184 {
 185         return sk_ns_capable(sk, &init_user_ns, cap);
 186 }
 187 EXPORT_SYMBOL(sk_capable);
 188
 189 /**
 190  * sk_net_capable - Network namespace socket capability test
 191  * @sk: Socket to use a capability on or through
 192  * @cap: The capability to use
 193  *
 194  * Test to see if the opener of the socket had when the socket was created
 195  * and the current process has the capability @cap over the network namespace
 196  * the socket is a member of.
 197  */
 198 bool sk_net_capable(const struct sock *sk, int cap)
 199 {
 200         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 201 }
 202 EXPORT_SYMBOL(sk_net_capable);
 203
 204 /*
 205  * Each address family might have different locking rules, so we have
 206  * one slock key per address family and separate keys for internal and
 207  * userspace sockets.
 208  */
 209 static struct lock_class_key af_family_keys[AF_MAX];
 210 static struct lock_class_key af_family_kern_keys[AF_MAX];
 211 static struct lock_class_key af_family_slock_keys[AF_MAX];
 212 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 213
 214 /*
 215  * Make lock validator output more readable. (we pre-construct these
 216  * strings build-time, so that runtime initialization of socket
 217  * locks is fast):
 218  */
 219
 220 #define _sock_locks(x)                                            \
 221   x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 222   x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 223   x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 224   x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 225   x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 226   x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 227   x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 228   x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 229   x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 230   x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 231   x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 232   x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 233   x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 234   x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 235   x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
 236   x "AF_MCTP"  , \
 237   x "AF_MAX"
 238
 239 static const char *const af_family_key_strings[AF_MAX+1] = {
 240         _sock_locks("sk_lock-")
 241 };
 242 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 243         _sock_locks("slock-")
 244 };
 245 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 246         _sock_locks("clock-")
 247 };
 248
 249 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 250         _sock_locks("k-sk_lock-")
 251 };
 252 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 253         _sock_locks("k-slock-")
 254 };
 255 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 256         _sock_locks("k-clock-")
 257 };
 258 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 259         _sock_locks("rlock-")
 260 };
 261 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 262         _sock_locks("wlock-")
 263 };
 264 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 265         _sock_locks("elock-")
 266 };
 267
 268 /*
 269  * sk_callback_lock and sk queues locking rules are per-address-family,
 270  * so split the lock classes by using a per-AF key:
 271  */
 272 static struct lock_class_key af_callback_keys[AF_MAX];
 273 static struct lock_class_key af_rlock_keys[AF_MAX];
 274 static struct lock_class_key af_wlock_keys[AF_MAX];
 275 static struct lock_class_key af_elock_keys[AF_MAX];
 276 static struct lock_class_key af_kern_callback_keys[AF_MAX];
 277
 278 /* Run time adjustable parameters. */
 279 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 280 EXPORT_SYMBOL(sysctl_wmem_max);
 281 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 282 EXPORT_SYMBOL(sysctl_rmem_max);
 283 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 284 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 285
 286 /* Maximal space eaten by iovec or ancillary data plus some space */
 287 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 288 EXPORT_SYMBOL(sysctl_optmem_max);
 289
 290 int sysctl_tstamp_allow_data __read_mostly = 1;
 291
 292 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 293 EXPORT_SYMBOL_GPL(memalloc_socks_key);
 294
 295 /**
 296  * sk_set_memalloc - sets %SOCK_MEMALLOC
 297  * @sk: socket to set it on
 298  *
 299  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 300  * It's the responsibility of the admin to adjust min_free_kbytes
 301  * to meet the requirements
 302  */
 303 void sk_set_memalloc(struct sock *sk)
 304 {
 305         sock_set_flag(sk, SOCK_MEMALLOC);
 306         sk->sk_allocation |= __GFP_MEMALLOC;
 307         static_branch_inc(&memalloc_socks_key);
 308 }
 309 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 310
 311 void sk_clear_memalloc(struct sock *sk)
 312 {
 313         sock_reset_flag(sk, SOCK_MEMALLOC);
 314         sk->sk_allocation &= ~__GFP_MEMALLOC;
 315         static_branch_dec(&memalloc_socks_key);
 316
 317         /*
 318          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 319          * progress of swapping. SOCK_MEMALLOC may be cleared while
 320          * it has rmem allocations due to the last swapfile being deactivated
 321          * but there is a risk that the socket is unusable due to exceeding
 322          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 323          */
 324         sk_mem_reclaim(sk);
 325 }
 326 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 327
 328 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 329 {
 330         int ret;
 331         unsigned int noreclaim_flag;
 332
 333         /* these should have been dropped before queueing */
 334         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 335
 336         noreclaim_flag = memalloc_noreclaim_save();
 337         ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
 338                                  tcp_v6_do_rcv,
 339                                  tcp_v4_do_rcv,
 340                                  sk, skb);
 341         memalloc_noreclaim_restore(noreclaim_flag);
 342
 343         return ret;
 344 }
 345 EXPORT_SYMBOL(__sk_backlog_rcv);
 346
 347 void sk_error_report(struct sock *sk)
 348 {
 349         sk->sk_error_report(sk);
 350
 351         switch (sk->sk_family) {
 352         case AF_INET:
 353                 fallthrough;
 354         case AF_INET6:
 355                 trace_inet_sk_error_report(sk);
 356                 break;
 357         default:
 358                 break;
 359         }
 360 }
 361 EXPORT_SYMBOL(sk_error_report);
 362
 363 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 364 {
 365         struct __kernel_sock_timeval tv;
 366
 367         if (timeo == MAX_SCHEDULE_TIMEOUT) {
 368                 tv.tv_sec = 0;
 369                 tv.tv_usec = 0;
 370         } else {
 371                 tv.tv_sec = timeo / HZ;
 372                 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 373         }
 374
 375         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 376                 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 377                 *(struct old_timeval32 *)optval = tv32;
 378                 return sizeof(tv32);
 379         }
 380
 381         if (old_timeval) {
 382                 struct __kernel_old_timeval old_tv;
 383                 old_tv.tv_sec = tv.tv_sec;
 384                 old_tv.tv_usec = tv.tv_usec;
 385                 *(struct __kernel_old_timeval *)optval = old_tv;
 386                 return sizeof(old_tv);
 387         }
 388
 389         *(struct __kernel_sock_timeval *)optval = tv;
 390         return sizeof(tv);
 391 }
 392 EXPORT_SYMBOL(sock_get_timeout);
 393
 394 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
 395                            sockptr_t optval, int optlen, bool old_timeval)
 396 {
 397         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 398                 struct old_timeval32 tv32;
 399
 400                 if (optlen < sizeof(tv32))
 401                         return -EINVAL;
 402
 403                 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
 404                         return -EFAULT;
 405                 tv->tv_sec = tv32.tv_sec;
 406                 tv->tv_usec = tv32.tv_usec;
 407         } else if (old_timeval) {
 408                 struct __kernel_old_timeval old_tv;
 409
 410                 if (optlen < sizeof(old_tv))
 411                         return -EINVAL;
 412                 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
 413                         return -EFAULT;
 414                 tv->tv_sec = old_tv.tv_sec;
 415                 tv->tv_usec = old_tv.tv_usec;
 416         } else {
 417                 if (optlen < sizeof(*tv))
 418                         return -EINVAL;
 419                 if (copy_from_sockptr(tv, optval, sizeof(*tv)))
 420                         return -EFAULT;
 421         }
 422
 423         return 0;
 424 }
 425 EXPORT_SYMBOL(sock_copy_user_timeval);
 426
 427 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
 428                             bool old_timeval)
 429 {
 430         struct __kernel_sock_timeval tv;
 431         int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
 432         long val;
 433
 434         if (err)
 435                 return err;
 436
 437         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 438                 return -EDOM;
 439
 440         if (tv.tv_sec < 0) {
 441                 static int warned __read_mostly;
 442
 443                 WRITE_ONCE(*timeo_p, 0);
 444                 if (warned < 10 && net_ratelimit()) {
 445                         warned++;
 446                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 447                                 __func__, current->comm, task_pid_nr(current));
 448                 }
 449                 return 0;
 450         }
 451         val = MAX_SCHEDULE_TIMEOUT;
 452         if ((tv.tv_sec || tv.tv_usec) &&
 453             (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
 454                 val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
 455                                                     USEC_PER_SEC / HZ);
 456         WRITE_ONCE(*timeo_p, val);
 457         return 0;
 458 }
 459
 460 static bool sock_needs_netstamp(const struct sock *sk)
 461 {
 462         switch (sk->sk_family) {
 463         case AF_UNSPEC:
 464         case AF_UNIX:
 465                 return false;
 466         default:
 467                 return true;
 468         }
 469 }
 470
 471 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 472 {
 473         if (sk->sk_flags & flags) {
 474                 sk->sk_flags &= ~flags;
 475                 if (sock_needs_netstamp(sk) &&
 476                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 477                         net_disable_timestamp();
 478         }
 479 }
 480
 481
 482 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 483 {
 484         unsigned long flags;
 485         struct sk_buff_head *list = &sk->sk_receive_queue;
 486
 487         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 488                 atomic_inc(&sk->sk_drops);
 489                 trace_sock_rcvqueue_full(sk, skb);
 490                 return -ENOMEM;
 491         }
 492
 493         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 494                 atomic_inc(&sk->sk_drops);
 495                 return -ENOBUFS;
 496         }
 497
 498         skb->dev = NULL;
 499         skb_set_owner_r(skb, sk);
 500
 501         /* we escape from rcu protected region, make sure we dont leak
 502          * a norefcounted dst
 503          */
 504         skb_dst_force(skb);
 505
 506         spin_lock_irqsave(&list->lock, flags);
 507         sock_skb_set_dropcount(sk, skb);
 508         __skb_queue_tail(list, skb);
 509         spin_unlock_irqrestore(&list->lock, flags);
 510
 511         if (!sock_flag(sk, SOCK_DEAD))
 512                 sk->sk_data_ready(sk);
 513         return 0;
 514 }
 515 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 516
 517 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
 518                               enum skb_drop_reason *reason)
 519 {
 520         enum skb_drop_reason drop_reason;
 521         int err;
 522
 523         err = sk_filter(sk, skb);
 524         if (err) {
 525                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
 526                 goto out;
 527         }
 528         err = __sock_queue_rcv_skb(sk, skb);
 529         switch (err) {
 530         case -ENOMEM:
 531                 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
 532                 break;
 533         case -ENOBUFS:
 534                 drop_reason = SKB_DROP_REASON_PROTO_MEM;
 535                 break;
 536         default:
 537                 drop_reason = SKB_NOT_DROPPED_YET;
 538                 break;
 539         }
 540 out:
 541         if (reason)
 542                 *reason = drop_reason;
 543         return err;
 544 }
 545 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
 546
 547 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 548                      const int nested, unsigned int trim_cap, bool refcounted)
 549 {
 550         int rc = NET_RX_SUCCESS;
 551
 552         if (sk_filter_trim_cap(sk, skb, trim_cap))
 553                 goto discard_and_relse;
 554
 555         skb->dev = NULL;
 556
 557         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 558                 atomic_inc(&sk->sk_drops);
 559                 goto discard_and_relse;
 560         }
 561         if (nested)
 562                 bh_lock_sock_nested(sk);
 563         else
 564                 bh_lock_sock(sk);
 565         if (!sock_owned_by_user(sk)) {
 566                 /*
 567                  * trylock + unlock semantics:
 568                  */
 569                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 570
 571                 rc = sk_backlog_rcv(sk, skb);
 572
 573                 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 574         } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
 575                 bh_unlock_sock(sk);
 576                 atomic_inc(&sk->sk_drops);
 577                 goto discard_and_relse;
 578         }
 579
 580         bh_unlock_sock(sk);
 581 out:
 582         if (refcounted)
 583                 sock_put(sk);
 584         return rc;
 585 discard_and_relse:
 586         kfree_skb(skb);
 587         goto out;
 588 }
 589 EXPORT_SYMBOL(__sk_receive_skb);
 590
 591 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
 592                                                           u32));
 593 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 594                                                            u32));
 595 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 596 {
 597         struct dst_entry *dst = __sk_dst_get(sk);
 598
 599         if (dst && dst->obsolete &&
 600             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 601                                dst, cookie) == NULL) {
 602                 sk_tx_queue_clear(sk);
 603                 WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
 604                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 605                 dst_release(dst);
 606                 return NULL;
 607         }
 608
 609         return dst;
 610 }
 611 EXPORT_SYMBOL(__sk_dst_check);
 612
 613 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 614 {
 615         struct dst_entry *dst = sk_dst_get(sk);
 616
 617         if (dst && dst->obsolete &&
 618             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 619                                dst, cookie) == NULL) {
 620                 sk_dst_reset(sk);
 621                 dst_release(dst);
 622                 return NULL;
 623         }
 624
 625         return dst;
 626 }
 627 EXPORT_SYMBOL(sk_dst_check);
 628
 629 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
 630 {
 631         int ret = -ENOPROTOOPT;
 632 #ifdef CONFIG_NETDEVICES
 633         struct net *net = sock_net(sk);
 634
 635         /* Sorry... */
 636         ret = -EPERM;
 637         if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
 638                 goto out;
 639
 640         ret = -EINVAL;
 641         if (ifindex < 0)
 642                 goto out;
 643
 644         /* Paired with all READ_ONCE() done locklessly. */
 645         WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
 646
 647         if (sk->sk_prot->rehash)
 648                 sk->sk_prot->rehash(sk);
 649         sk_dst_reset(sk);
 650
 651         ret = 0;
 652
 653 out:
 654 #endif
 655
 656         return ret;
 657 }
 658
 659 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
 660 {
 661         int ret;
 662
 663         if (lock_sk)
 664                 lock_sock(sk);
 665         ret = sock_bindtoindex_locked(sk, ifindex);
 666         if (lock_sk)
 667                 release_sock(sk);
 668
 669         return ret;
 670 }
 671 EXPORT_SYMBOL(sock_bindtoindex);
 672
 673 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
 674 {
 675         int ret = -ENOPROTOOPT;
 676 #ifdef CONFIG_NETDEVICES
 677         struct net *net = sock_net(sk);
 678         char devname[IFNAMSIZ];
 679         int index;
 680
 681         ret = -EINVAL;
 682         if (optlen < 0)
 683                 goto out;
 684
 685         /* Bind this socket to a particular device like "eth0",
 686          * as specified in the passed interface name. If the
 687          * name is "" or the option length is zero the socket
 688          * is not bound.
 689          */
 690         if (optlen > IFNAMSIZ - 1)
 691                 optlen = IFNAMSIZ - 1;
 692         memset(devname, 0, sizeof(devname));
 693
 694         ret = -EFAULT;
 695         if (copy_from_sockptr(devname, optval, optlen))
 696                 goto out;
 697
 698         index = 0;
 699         if (devname[0] != '\0') {
 700                 struct net_device *dev;
 701
 702                 rcu_read_lock();
 703                 dev = dev_get_by_name_rcu(net, devname);
 704                 if (dev)
 705                         index = dev->ifindex;
 706                 rcu_read_unlock();
 707                 ret = -ENODEV;
 708                 if (!dev)
 709                         goto out;
 710         }
 711
 712         sockopt_lock_sock(sk);
 713         ret = sock_bindtoindex_locked(sk, index);
 714         sockopt_release_sock(sk);
 715 out:
 716 #endif
 717
 718         return ret;
 719 }
 720
 721 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
 722                                 sockptr_t optlen, int len)
 723 {
 724         int ret = -ENOPROTOOPT;
 725 #ifdef CONFIG_NETDEVICES
 726         int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
 727         struct net *net = sock_net(sk);
 728         char devname[IFNAMSIZ];
 729
 730         if (bound_dev_if == 0) {
 731                 len = 0;
 732                 goto zero;
 733         }
 734
 735         ret = -EINVAL;
 736         if (len < IFNAMSIZ)
 737                 goto out;
 738
 739         ret = netdev_get_name(net, devname, bound_dev_if);
 740         if (ret)
 741                 goto out;
 742
 743         len = strlen(devname) + 1;
 744
 745         ret = -EFAULT;
 746         if (copy_to_sockptr(optval, devname, len))
 747                 goto out;
 748
 749 zero:
 750         ret = -EFAULT;
 751         if (copy_to_sockptr(optlen, &len, sizeof(int)))
 752                 goto out;
 753
 754         ret = 0;
 755
 756 out:
 757 #endif
 758
 759         return ret;
 760 }
 761
 762 bool sk_mc_loop(struct sock *sk)
 763 {
 764         if (dev_recursion_level())
 765                 return false;
 766         if (!sk)
 767                 return true;
 768         /* IPV6_ADDRFORM can change sk->sk_family under us. */
 769         switch (READ_ONCE(sk->sk_family)) {
 770         case AF_INET:
 771                 return inet_test_bit(MC_LOOP, sk);
 772 #if IS_ENABLED(CONFIG_IPV6)
 773         case AF_INET6:
 774                 return inet6_sk(sk)->mc_loop;
 775 #endif
 776         }
 777         WARN_ON_ONCE(1);
 778         return true;
 779 }
 780 EXPORT_SYMBOL(sk_mc_loop);
 781
 782 void sock_set_reuseaddr(struct sock *sk)
 783 {
 784         lock_sock(sk);
 785         sk->sk_reuse = SK_CAN_REUSE;
 786         release_sock(sk);
 787 }
 788 EXPORT_SYMBOL(sock_set_reuseaddr);
 789
 790 void sock_set_reuseport(struct sock *sk)
 791 {
 792         lock_sock(sk);
 793         sk->sk_reuseport = true;
 794         release_sock(sk);
 795 }
 796 EXPORT_SYMBOL(sock_set_reuseport);
 797
 798 void sock_no_linger(struct sock *sk)
 799 {
 800         lock_sock(sk);
 801         WRITE_ONCE(sk->sk_lingertime, 0);
 802         sock_set_flag(sk, SOCK_LINGER);
 803         release_sock(sk);
 804 }
 805 EXPORT_SYMBOL(sock_no_linger);
 806
 807 void sock_set_priority(struct sock *sk, u32 priority)
 808 {
 809         lock_sock(sk);
 810         WRITE_ONCE(sk->sk_priority, priority);
 811         release_sock(sk);
 812 }
 813 EXPORT_SYMBOL(sock_set_priority);
 814
 815 void sock_set_sndtimeo(struct sock *sk, s64 secs)
 816 {
 817         lock_sock(sk);
 818         if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
 819                 WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
 820         else
 821                 WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
 822         release_sock(sk);
 823 }
 824 EXPORT_SYMBOL(sock_set_sndtimeo);
 825
 826 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
 827 {
 828         if (val)  {
 829                 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
 830                 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
 831                 sock_set_flag(sk, SOCK_RCVTSTAMP);
 832                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 833         } else {
 834                 sock_reset_flag(sk, SOCK_RCVTSTAMP);
 835                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 836         }
 837 }
 838
 839 void sock_enable_timestamps(struct sock *sk)
 840 {
 841         lock_sock(sk);
 842         __sock_set_timestamps(sk, true, false, true);
 843         release_sock(sk);
 844 }
 845 EXPORT_SYMBOL(sock_enable_timestamps);
 846
 847 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
 848 {
 849         switch (optname) {
 850         case SO_TIMESTAMP_OLD:
 851                 __sock_set_timestamps(sk, valbool, false, false);
 852                 break;
 853         case SO_TIMESTAMP_NEW:
 854                 __sock_set_timestamps(sk, valbool, true, false);
 855                 break;
 856         case SO_TIMESTAMPNS_OLD:
 857                 __sock_set_timestamps(sk, valbool, false, true);
 858                 break;
 859         case SO_TIMESTAMPNS_NEW:
 860                 __sock_set_timestamps(sk, valbool, true, true);
 861                 break;
 862         }
 863 }
 864
 865 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
 866 {
 867         struct net *net = sock_net(sk);
 868         struct net_device *dev = NULL;
 869         bool match = false;
 870         int *vclock_index;
 871         int i, num;
 872
 873         if (sk->sk_bound_dev_if)
 874                 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
 875
 876         if (!dev) {
 877                 pr_err("%s: sock not bind to device\n", __func__);
 878                 return -EOPNOTSUPP;
 879         }
 880
 881         num = ethtool_get_phc_vclocks(dev, &vclock_index);
 882         dev_put(dev);
 883
 884         for (i = 0; i < num; i++) {
 885                 if (*(vclock_index + i) == phc_index) {
 886                         match = true;
 887                         break;
 888                 }
 889         }
 890
 891         if (num > 0)
 892                 kfree(vclock_index);
 893
 894         if (!match)
 895                 return -EINVAL;
 896
 897         WRITE_ONCE(sk->sk_bind_phc, phc_index);
 898
 899         return 0;
 900 }
 901
 902 int sock_set_timestamping(struct sock *sk, int optname,
 903                           struct so_timestamping timestamping)
 904 {
 905         int val = timestamping.flags;
 906         int ret;
 907
 908         if (val & ~SOF_TIMESTAMPING_MASK)
 909                 return -EINVAL;
 910
 911         if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
 912             !(val & SOF_TIMESTAMPING_OPT_ID))
 913                 return -EINVAL;
 914
 915         if (val & SOF_TIMESTAMPING_OPT_ID &&
 916             !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 917                 if (sk_is_tcp(sk)) {
 918                         if ((1 << sk->sk_state) &
 919                             (TCPF_CLOSE | TCPF_LISTEN))
 920                                 return -EINVAL;
 921                         if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
 922                                 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
 923                         else
 924                                 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
 925                 } else {
 926                         atomic_set(&sk->sk_tskey, 0);
 927                 }
 928         }
 929
 930         if (val & SOF_TIMESTAMPING_OPT_STATS &&
 931             !(val & SOF_TIMESTAMPING_OPT_TSONLY))
 932                 return -EINVAL;
 933
 934         if (val & SOF_TIMESTAMPING_BIND_PHC) {
 935                 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
 936                 if (ret)
 937                         return ret;
 938         }
 939
 940         WRITE_ONCE(sk->sk_tsflags, val);
 941         sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
 942
 943         if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 944                 sock_enable_timestamp(sk,
 945                                       SOCK_TIMESTAMPING_RX_SOFTWARE);
 946         else
 947                 sock_disable_timestamp(sk,
 948                                        (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 949         return 0;
 950 }
 951
 952 void sock_set_keepalive(struct sock *sk)
 953 {
 954         lock_sock(sk);
 955         if (sk->sk_prot->keepalive)
 956                 sk->sk_prot->keepalive(sk, true);
 957         sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
 958         release_sock(sk);
 959 }
 960 EXPORT_SYMBOL(sock_set_keepalive);
 961
 962 static void __sock_set_rcvbuf(struct sock *sk, int val)
 963 {
 964         /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
 965          * as a negative value.
 966          */
 967         val = min_t(int, val, INT_MAX / 2);
 968         sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 969
 970         /* We double it on the way in to account for "struct sk_buff" etc.
 971          * overhead.   Applications assume that the SO_RCVBUF setting they make
 972          * will allow that much actual data to be received on that socket.
 973          *
 974          * Applications are unaware that "struct sk_buff" and other overheads
 975          * allocate from the receive buffer during socket buffer allocation.
 976          *
 977          * And after considering the possible alternatives, returning the value
 978          * we actually used in getsockopt is the most desirable behavior.
 979          */
 980         WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
 981 }
 982
 983 void sock_set_rcvbuf(struct sock *sk, int val)
 984 {
 985         lock_sock(sk);
 986         __sock_set_rcvbuf(sk, val);
 987         release_sock(sk);
 988 }
 989 EXPORT_SYMBOL(sock_set_rcvbuf);
 990
 991 static void __sock_set_mark(struct sock *sk, u32 val)
 992 {
 993         if (val != sk->sk_mark) {
 994                 WRITE_ONCE(sk->sk_mark, val);
 995                 sk_dst_reset(sk);
 996         }
 997 }
 998
 999 void sock_set_mark(struct sock *sk, u32 val)
1000 {
1001         lock_sock(sk);
1002         __sock_set_mark(sk, val);
1003         release_sock(sk);
1004 }
1005 EXPORT_SYMBOL(sock_set_mark);
1006
1007 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1008 {
1009         /* Round down bytes to multiple of pages */
1010         bytes = round_down(bytes, PAGE_SIZE);
1011
1012         WARN_ON(bytes > sk->sk_reserved_mem);
1013         WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1014         sk_mem_reclaim(sk);
1015 }
1016
1017 static int sock_reserve_memory(struct sock *sk, int bytes)
1018 {
1019         long allocated;
1020         bool charged;
1021         int pages;
1022
1023         if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1024                 return -EOPNOTSUPP;
1025
1026         if (!bytes)
1027                 return 0;
1028
1029         pages = sk_mem_pages(bytes);
1030
1031         /* pre-charge to memcg */
1032         charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1033                                           GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1034         if (!charged)
1035                 return -ENOMEM;
1036
1037         /* pre-charge to forward_alloc */
1038         sk_memory_allocated_add(sk, pages);
1039         allocated = sk_memory_allocated(sk);
1040         /* If the system goes into memory pressure with this
1041          * precharge, give up and return error.
1042          */
1043         if (allocated > sk_prot_mem_limits(sk, 1)) {
1044                 sk_memory_allocated_sub(sk, pages);
1045                 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1046                 return -ENOMEM;
1047         }
1048         sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1049
1050         WRITE_ONCE(sk->sk_reserved_mem,
1051                    sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1052
1053         return 0;
1054 }
1055
/**
 * sockopt_lock_sock - take the socket lock for a sockopt operation
 * @sk: socket to lock
 *
 * When current->bpf_ctx is set, the setsockopt is being issued by a bpf
 * prog, and bpf has already made sure the sk lock is held before calling
 * setsockopt(); in that case nothing is done.  Otherwise the socket lock
 * is acquired with lock_sock().
 */
void sockopt_lock_sock(struct sock *sk)
{
        if (!has_current_bpf_ctx())
                lock_sock(sk);
}
EXPORT_SYMBOL(sockopt_lock_sock);
1068
/**
 * sockopt_release_sock - drop the socket lock after a sockopt operation
 * @sk: socket to unlock
 *
 * Does nothing when has_current_bpf_ctx() is true; otherwise calls
 * release_sock().
 */
void sockopt_release_sock(struct sock *sk)
{
        if (!has_current_bpf_ctx())
                release_sock(sk);
}
EXPORT_SYMBOL(sockopt_release_sock);
1077
1078 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1079 {
1080         return has_current_bpf_ctx() || ns_capable(ns, cap);
1081 }
1082 EXPORT_SYMBOL(sockopt_ns_capable);
1083
1084 bool sockopt_capable(int cap)
1085 {
1086         return has_current_bpf_ctx() || capable(cap);
1087 }
1088 EXPORT_SYMBOL(sockopt_capable);
1089
/*
 *      This is meant for all protocols to use and covers goings on
 *      at the socket level. Everything here is generic.
 *
 *      sk_setsockopt - set one generic option on a socket
 *      @sk:      socket to configure
 *      @level:   option level; not consulted by this function
 *      @optname: SO_* option to set
 *      @optval:  option value, kernel or user pointer
 *      @optlen:  number of bytes available at @optval
 *
 *      SO_BINDTODEVICE is handed to sock_setbindtodevice() straight away.
 *      For every other option at least sizeof(int) bytes must be supplied;
 *      the leading int is read into 'val' and normalised to 0/1 in
 *      'valbool'.  The option is then handled with the socket locked via
 *      sockopt_lock_sock() / sockopt_release_sock().
 *
 *      Returns 0 on success or a negative errno; unknown options and the
 *      read-only SO_TYPE, SO_PROTOCOL, SO_DOMAIN and SO_ERROR yield
 *      -ENOPROTOOPT.
 */

int sk_setsockopt(struct sock *sk, int level, int optname,
                  sockptr_t optval, unsigned int optlen)
{
        struct so_timestamping timestamping;
        struct socket *sock = sk->sk_socket;
        struct sock_txtime sk_txtime;
        int val;
        int valbool;
        struct linger ling;
        int ret = 0;

        /*
         *      SO_BINDTODEVICE is dispatched before the generic int-sized
         *      argument is read; sock_setbindtodevice() receives the raw
         *      optval/optlen pair.
         */

        if (optname == SO_BINDTODEVICE)
                return sock_setbindtodevice(sk, optval, optlen);

        if (optlen < sizeof(int))
                return -EINVAL;

        /* Options with a larger argument copy it again in their own case. */
        if (copy_from_sockptr(&val, optval, sizeof(val)))
                return -EFAULT;

        valbool = val ? 1 : 0;

        sockopt_lock_sock(sk);

        switch (optname) {
        case SO_DEBUG:
                /* Only enabling requires CAP_NET_ADMIN. */
                if (val && !sockopt_capable(CAP_NET_ADMIN))
                        ret = -EACCES;
                else
                        sock_valbool_flag(sk, SOCK_DBG, valbool);
                break;
        case SO_REUSEADDR:
                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
                break;
        case SO_REUSEPORT:
                sk->sk_reuseport = valbool;
                break;
        case SO_TYPE:
        case SO_PROTOCOL:
        case SO_DOMAIN:
        case SO_ERROR:
                ret = -ENOPROTOOPT;
                break;
        case SO_DONTROUTE:
                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
                sk_dst_reset(sk);
                break;
        case SO_BROADCAST:
                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
                break;
        case SO_SNDBUF:
                /* Don't error on this BSD doesn't and if you think
                 * about it this is right. Otherwise apps have to
                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
                 * are treated in BSD as hints
                 */
                val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
set_sndbuf:
                /* Ensure val * 2 fits into an int, to prevent max_t()
                 * from treating it as a negative value.
                 */
                val = min_t(int, val, INT_MAX / 2);
                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
                WRITE_ONCE(sk->sk_sndbuf,
                           max_t(int, val * 2, SOCK_MIN_SNDBUF));
                /* Wake up sending tasks if we upped the value. */
                sk->sk_write_space(sk);
                break;

        case SO_SNDBUFFORCE:
                if (!sockopt_capable(CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }

                /* No negative values (to prevent underflow, as val will be
                 * multiplied by 2).
                 */
                if (val < 0)
                        val = 0;
                /* Shares the SO_SNDBUF tail, skipping the sysctl_wmem_max clamp. */
                goto set_sndbuf;

        case SO_RCVBUF:
                /* Don't error on this BSD doesn't and if you think
                 * about it this is right. Otherwise apps have to
                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
                 * are treated in BSD as hints
                 */
                __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
                break;

        case SO_RCVBUFFORCE:
                if (!sockopt_capable(CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }

                /* No negative values (to prevent underflow, as val will be
                 * multiplied by 2).
                 */
                __sock_set_rcvbuf(sk, max(val, 0));
                break;

        case SO_KEEPALIVE:
                /* The protocol hook is optional; the flag is always updated. */
                if (sk->sk_prot->keepalive)
                        sk->sk_prot->keepalive(sk, valbool);
                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
                break;

        case SO_OOBINLINE:
                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
                break;

        case SO_NO_CHECK:
                sk->sk_no_check_tx = valbool;
                break;

        case SO_PRIORITY:
                /* Values 0..6 need no capability; anything else requires
                 * CAP_NET_RAW or CAP_NET_ADMIN in the netns' user namespace.
                 */
                if ((val >= 0 && val <= 6) ||
                    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
                    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
                        WRITE_ONCE(sk->sk_priority, val);
                else
                        ret = -EPERM;
                break;

        case SO_LINGER:
                if (optlen < sizeof(ling)) {
                        ret = -EINVAL;  /* 1003.1g */
                        break;
                }
                if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
                        ret = -EFAULT;
                        break;
                }
                if (!ling.l_onoff) {
                        sock_reset_flag(sk, SOCK_LINGER);
                } else {
                        unsigned long t_sec = ling.l_linger;

                        /* Seconds that would not fit once multiplied by HZ
                         * are stored as MAX_SCHEDULE_TIMEOUT.
                         */
                        if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
                                WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
                        else
                                WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
                        sock_set_flag(sk, SOCK_LINGER);
                }
                break;

        case SO_BSDCOMPAT:
                /* Accepted and ignored. */
                break;

        case SO_PASSCRED:
                assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
                break;

        case SO_PASSPIDFD:
                assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
                break;

        case SO_TIMESTAMP_OLD:
        case SO_TIMESTAMP_NEW:
        case SO_TIMESTAMPNS_OLD:
        case SO_TIMESTAMPNS_NEW:
                sock_set_timestamp(sk, optname, valbool);
                break;

        case SO_TIMESTAMPING_NEW:
        case SO_TIMESTAMPING_OLD:
                /* Two argument layouts are accepted: a full
                 * struct so_timestamping, or (any other length) just the
                 * leading int, used as the flags with the rest zeroed.
                 */
                if (optlen == sizeof(timestamping)) {
                        if (copy_from_sockptr(&timestamping, optval,
                                              sizeof(timestamping))) {
                                ret = -EFAULT;
                                break;
                        }
                } else {
                        memset(&timestamping, 0, sizeof(timestamping));
                        timestamping.flags = val;
                }
                ret = sock_set_timestamping(sk, optname, timestamping);
                break;

        case SO_RCVLOWAT:
                {
                int (*set_rcvlowat)(struct sock *sk, int val) = NULL;

                /* Negative requests are treated as INT_MAX. */
                if (val < 0)
                        val = INT_MAX;
                if (sock)
                        set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
                if (set_rcvlowat)
                        ret = set_rcvlowat(sk, val);
                else
                        /* Without a hook, 0 is stored as 1. */
                        WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
                break;
                }
        case SO_RCVTIMEO_OLD:
        case SO_RCVTIMEO_NEW:
                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
                                       optlen, optname == SO_RCVTIMEO_OLD);
                break;

        case SO_SNDTIMEO_OLD:
        case SO_SNDTIMEO_NEW:
                ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
                                       optlen, optname == SO_SNDTIMEO_OLD);
                break;

        case SO_ATTACH_FILTER: {
                struct sock_fprog fprog;

                ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
                if (!ret)
                        ret = sk_attach_filter(&fprog, sk);
                break;
        }
        case SO_ATTACH_BPF:
                /* The argument must be exactly a u32, passed to sk_attach_bpf(). */
                ret = -EINVAL;
                if (optlen == sizeof(u32)) {
                        u32 ufd;

                        ret = -EFAULT;
                        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
                                break;

                        ret = sk_attach_bpf(ufd, sk);
                }
                break;

        case SO_ATTACH_REUSEPORT_CBPF: {
                struct sock_fprog fprog;

                ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
                if (!ret)
                        ret = sk_reuseport_attach_filter(&fprog, sk);
                break;
        }
        case SO_ATTACH_REUSEPORT_EBPF:
                /* Same argument rules as SO_ATTACH_BPF. */
                ret = -EINVAL;
                if (optlen == sizeof(u32)) {
                        u32 ufd;

                        ret = -EFAULT;
                        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
                                break;

                        ret = sk_reuseport_attach_bpf(ufd, sk);
                }
                break;

        case SO_DETACH_REUSEPORT_BPF:
                ret = reuseport_detach_prog(sk);
                break;

        case SO_DETACH_FILTER:
                ret = sk_detach_filter(sk);
                break;

        case SO_LOCK_FILTER:
                /* Once set, the lock cannot be cleared again. */
                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
                        ret = -EPERM;
                else
                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
                break;

        case SO_PASSSEC:
                assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
                break;
        case SO_MARK:
                if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
                    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }

                __sock_set_mark(sk, val);
                break;
        case SO_RCVMARK:
                sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
                break;

        case SO_RXQ_OVFL:
                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
                break;

        case SO_WIFI_STATUS:
                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
                break;

        case SO_PEEK_OFF:
                {
                int (*set_peek_off)(struct sock *sk, int val);

                set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
                if (set_peek_off)
                        ret = set_peek_off(sk, val);
                else
                        ret = -EOPNOTSUPP;
                break;
                }

        case SO_NOFCS:
                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
                break;

        case SO_SELECT_ERR_QUEUE:
                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
                break;

#ifdef CONFIG_NET_RX_BUSY_POLL
        case SO_BUSY_POLL:
                if (val < 0)
                        ret = -EINVAL;
                else
                        WRITE_ONCE(sk->sk_ll_usec, val);
                break;
        case SO_PREFER_BUSY_POLL:
                if (valbool && !sockopt_capable(CAP_NET_ADMIN))
                        ret = -EPERM;
                else
                        WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
                break;
        case SO_BUSY_POLL_BUDGET:
                /* Raising the budget above its current value requires
                 * CAP_NET_ADMIN; the range check applies either way.
                 */
                if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) {
                        ret = -EPERM;
                } else {
                        if (val < 0 || val > U16_MAX)
                                ret = -EINVAL;
                        else
                                WRITE_ONCE(sk->sk_busy_poll_budget, val);
                }
                break;
#endif

        case SO_MAX_PACING_RATE:
                {
                /* An all-ones int maps to an all-ones unsigned long; other
                 * values are taken as unsigned.
                 */
                unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;

                /* Where unsigned long is wider than int and the caller
                 * supplied enough bytes, re-read the full-width value.
                 */
                if (sizeof(ulval) != sizeof(val) &&
                    optlen >= sizeof(ulval) &&
                    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
                        ret = -EFAULT;
                        break;
                }
                if (ulval != ~0UL)
                        cmpxchg(&sk->sk_pacing_status,
                                SK_PACING_NONE,
                                SK_PACING_NEEDED);
                /* Pairs with READ_ONCE() from sk_getsockopt() */
                WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
                sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
                break;
                }
        case SO_INCOMING_CPU:
                reuseport_update_incoming_cpu(sk, val);
                break;

        case SO_CNX_ADVICE:
                /* Only the value 1 has an effect. */
                if (val == 1)
                        dst_negative_advice(sk);
                break;

        case SO_ZEROCOPY:
                /* Allowed for TCP and UDP datagram sockets in PF_INET /
                 * PF_INET6, and for PF_RDS; everything else is rejected.
                 */
                if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
                        if (!(sk_is_tcp(sk) ||
                              (sk->sk_type == SOCK_DGRAM &&
                               sk->sk_protocol == IPPROTO_UDP)))
                                ret = -EOPNOTSUPP;
                } else if (sk->sk_family != PF_RDS) {
                        ret = -EOPNOTSUPP;
                }
                if (!ret) {
                        if (val < 0 || val > 1)
                                ret = -EINVAL;
                        else
                                sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
                }
                break;

        case SO_TXTIME:
                if (optlen != sizeof(struct sock_txtime)) {
                        ret = -EINVAL;
                        break;
                } else if (copy_from_sockptr(&sk_txtime, optval,
                           sizeof(struct sock_txtime))) {
                        ret = -EFAULT;
                        break;
                } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
                        ret = -EINVAL;
                        break;
                }
                /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
                 * scheduler has enough safe guards.
                 */
                if (sk_txtime.clockid != CLOCK_MONOTONIC &&
                    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }
                sock_valbool_flag(sk, SOCK_TXTIME, true);
                sk->sk_clockid = sk_txtime.clockid;
                sk->sk_txtime_deadline_mode =
                        !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
                sk->sk_txtime_report_errors =
                        !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
                break;

        case SO_BINDTOIFINDEX:
                ret = sock_bindtoindex_locked(sk, val);
                break;

        case SO_BUF_LOCK:
                if (val & ~SOCK_BUF_LOCK_MASK) {
                        ret = -EINVAL;
                        break;
                }
                /* Replace only the bits covered by SOCK_BUF_LOCK_MASK. */
                sk->sk_userlocks = val | (sk->sk_userlocks &
                                          ~SOCK_BUF_LOCK_MASK);
                break;

        case SO_RESERVE_MEM:
        {
                int delta;

                if (val < 0) {
                        ret = -EINVAL;
                        break;
                }

                /* val is the new target; act on the difference from the
                 * current reservation.
                 */
                delta = val - sk->sk_reserved_mem;
                if (delta < 0)
                        sock_release_reserved_memory(sk, -delta);
                else
                        ret = sock_reserve_memory(sk, delta);
                break;
        }

        case SO_TXREHASH:
                if (val < -1 || val > 1) {
                        ret = -EINVAL;
                        break;
                }
                /* SOCK_TXREHASH_DEFAULT is replaced by the netns sysctl value. */
                if ((u8)val == SOCK_TXREHASH_DEFAULT)
                        val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
                /* Paired with READ_ONCE() in tcp_rtx_synack()
                 * and sk_getsockopt().
                 */
                WRITE_ONCE(sk->sk_txrehash, (u8)val);
                break;

        default:
                ret = -ENOPROTOOPT;
                break;
        }
        sockopt_release_sock(sk);
        return ret;
}
1555
1556 int sock_setsockopt(struct socket *sock, int level, int optname,
1557                     sockptr_t optval, unsigned int optlen)
1558 {
1559         return sk_setsockopt(sock->sk, level, optname,
1560                              optval, optlen);
1561 }
1562 EXPORT_SYMBOL(sock_setsockopt);
1563
1564 static const struct cred *sk_get_peer_cred(struct sock *sk)
1565 {
1566         const struct cred *cred;
1567
1568         spin_lock(&sk->sk_peer_lock);
1569         cred = get_cred(sk->sk_peer_cred);
1570         spin_unlock(&sk->sk_peer_lock);
1571
1572         return cred;
1573 }
1574
1575 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1576                           struct ucred *ucred)
1577 {
1578         ucred->pid = pid_vnr(pid);
1579         ucred->uid = ucred->gid = -1;
1580         if (cred) {
1581                 struct user_namespace *current_ns = current_user_ns();
1582
1583                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1584                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1585         }
1586 }
1587
1588 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1589 {
1590         struct user_namespace *user_ns = current_user_ns();
1591         int i;
1592
1593         for (i = 0; i < src->ngroups; i++) {
1594                 gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1595
1596                 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1597                         return -EFAULT;
1598         }
1599
1600         return 0;
1601 }
1602
1603 int sk_getsockopt(struct sock *sk, int level, int optname,
1604                   sockptr_t optval, sockptr_t optlen)
1605 {
1606         struct socket *sock = sk->sk_socket;
1607
1608         union {
1609                 int val;
1610                 u64 val64;
1611                 unsigned long ulval;
1612                 struct linger ling;
1613                 struct old_timeval32 tm32;
1614                 struct __kernel_old_timeval tm;
1615                 struct  __kernel_sock_timeval stm;
1616                 struct sock_txtime txtime;
1617                 struct so_timestamping timestamping;
1618         } v;
1619
1620         int lv = sizeof(int);
1621         int len;
1622
1623         if (copy_from_sockptr(&len, optlen, sizeof(int)))
1624                 return -EFAULT;
1625         if (len < 0)
1626                 return -EINVAL;
1627
1628         memset(&v, 0, sizeof(v));
1629
1630         switch (optname) {
1631         case SO_DEBUG:
1632                 v.val = sock_flag(sk, SOCK_DBG);
1633                 break;
1634
1635         case SO_DONTROUTE:
1636                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1637                 break;
1638
1639         case SO_BROADCAST:
1640                 v.val = sock_flag(sk, SOCK_BROADCAST);
1641                 break;
1642
1643         case SO_SNDBUF:
1644                 v.val = READ_ONCE(sk->sk_sndbuf);
1645                 break;
1646
1647         case SO_RCVBUF:
1648                 v.val = READ_ONCE(sk->sk_rcvbuf);
1649                 break;
1650
1651         case SO_REUSEADDR:
1652                 v.val = sk->sk_reuse;
1653                 break;
1654
1655         case SO_REUSEPORT:
1656                 v.val = sk->sk_reuseport;
1657                 break;
1658
1659         case SO_KEEPALIVE:
1660                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1661                 break;
1662
1663         case SO_TYPE:
1664                 v.val = sk->sk_type;
1665                 break;
1666
1667         case SO_PROTOCOL:
1668                 v.val = sk->sk_protocol;
1669                 break;
1670
1671         case SO_DOMAIN:
1672                 v.val = sk->sk_family;
1673                 break;
1674
1675         case SO_ERROR:
1676                 v.val = -sock_error(sk);
1677                 if (v.val == 0)
1678                         v.val = xchg(&sk->sk_err_soft, 0);
1679                 break;
1680
1681         case SO_OOBINLINE:
1682                 v.val = sock_flag(sk, SOCK_URGINLINE);
1683                 break;
1684
1685         case SO_NO_CHECK:
1686                 v.val = sk->sk_no_check_tx;
1687                 break;
1688
1689         case SO_PRIORITY:
1690                 v.val = READ_ONCE(sk->sk_priority);
1691                 break;
1692
1693         case SO_LINGER:
1694                 lv              = sizeof(v.ling);
1695                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1696                 v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ;
1697                 break;
1698
1699         case SO_BSDCOMPAT:
1700                 break;
1701
1702         case SO_TIMESTAMP_OLD:
1703                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1704                                 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1705                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1706                 break;
1707
1708         case SO_TIMESTAMPNS_OLD:
1709                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1710                 break;
1711
1712         case SO_TIMESTAMP_NEW:
1713                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1714                 break;
1715
1716         case SO_TIMESTAMPNS_NEW:
1717                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1718                 break;
1719
1720         case SO_TIMESTAMPING_OLD:
1721         case SO_TIMESTAMPING_NEW:
1722                 lv = sizeof(v.timestamping);
1723                 /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1724                  * returning the flags when they were set through the same option.
1725                  * Don't change the beviour for the old case SO_TIMESTAMPING_OLD.
1726                  */
1727                 if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1728                         v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1729                         v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1730                 }
1731                 break;
1732
1733         case SO_RCVTIMEO_OLD:
1734         case SO_RCVTIMEO_NEW:
1735                 lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1736                                       SO_RCVTIMEO_OLD == optname);
1737                 break;
1738
1739         case SO_SNDTIMEO_OLD:
1740         case SO_SNDTIMEO_NEW:
1741                 lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1742                                       SO_SNDTIMEO_OLD == optname);
1743                 break;
1744
1745         case SO_RCVLOWAT:
1746                 v.val = READ_ONCE(sk->sk_rcvlowat);
1747                 break;
1748
1749         case SO_SNDLOWAT:
1750                 v.val = 1;
1751                 break;
1752
1753         case SO_PASSCRED:
1754                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1755                 break;
1756
1757         case SO_PASSPIDFD:
1758                 v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1759                 break;
1760
1761         case SO_PEERCRED:
1762         {
1763                 struct ucred peercred;
1764                 if (len > sizeof(peercred))
1765                         len = sizeof(peercred);
1766
1767                 spin_lock(&sk->sk_peer_lock);
1768                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1769                 spin_unlock(&sk->sk_peer_lock);
1770
1771                 if (copy_to_sockptr(optval, &peercred, len))
1772                         return -EFAULT;
1773                 goto lenout;
1774         }
1775
1776         case SO_PEERPIDFD:
1777         {
1778                 struct pid *peer_pid;
1779                 struct file *pidfd_file = NULL;
1780                 int pidfd;
1781
1782                 if (len > sizeof(pidfd))
1783                         len = sizeof(pidfd);
1784
1785                 spin_lock(&sk->sk_peer_lock);
1786                 peer_pid = get_pid(sk->sk_peer_pid);
1787                 spin_unlock(&sk->sk_peer_lock);
1788
1789                 if (!peer_pid)
1790                         return -ENODATA;
1791
1792                 pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1793                 put_pid(peer_pid);
1794                 if (pidfd < 0)
1795                         return pidfd;
1796
1797                 if (copy_to_sockptr(optval, &pidfd, len) ||
1798                     copy_to_sockptr(optlen, &len, sizeof(int))) {
1799                         put_unused_fd(pidfd);
1800                         fput(pidfd_file);
1801
1802                         return -EFAULT;
1803                 }
1804
1805                 fd_install(pidfd, pidfd_file);
1806                 return 0;
1807         }
1808
1809         case SO_PEERGROUPS:
1810         {
1811                 const struct cred *cred;
1812                 int ret, n;
1813
1814                 cred = sk_get_peer_cred(sk);
1815                 if (!cred)
1816                         return -ENODATA;
1817
1818                 n = cred->group_info->ngroups;
1819                 if (len < n * sizeof(gid_t)) {
1820                         len = n * sizeof(gid_t);
1821                         put_cred(cred);
1822                         return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1823                 }
1824                 len = n * sizeof(gid_t);
1825
1826                 ret = groups_to_user(optval, cred->group_info);
1827                 put_cred(cred);
1828                 if (ret)
1829                         return ret;
1830                 goto lenout;
1831         }
1832
1833         case SO_PEERNAME:
1834         {
1835                 struct sockaddr_storage address;
1836
1837                 lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1838                 if (lv < 0)
1839                         return -ENOTCONN;
1840                 if (lv < len)
1841                         return -EINVAL;
1842                 if (copy_to_sockptr(optval, &address, len))
1843                         return -EFAULT;
1844                 goto lenout;
1845         }
1846
1847         /* Dubious BSD thing... Probably nobody even uses it, but
1848          * the UNIX standard wants it for whatever reason... -DaveM
1849          */
1850         case SO_ACCEPTCONN:
1851                 v.val = sk->sk_state == TCP_LISTEN;
1852                 break;
1853
1854         case SO_PASSSEC:
1855                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1856                 break;
1857
1858         case SO_PEERSEC:
1859                 return security_socket_getpeersec_stream(sock,
1860                                                          optval, optlen, len);
1861
1862         case SO_MARK:
1863                 v.val = READ_ONCE(sk->sk_mark);
1864                 break;
1865
1866         case SO_RCVMARK:
1867                 v.val = sock_flag(sk, SOCK_RCVMARK);
1868                 break;
1869
1870         case SO_RXQ_OVFL:
1871                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1872                 break;
1873
1874         case SO_WIFI_STATUS:
1875                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1876                 break;
1877
1878         case SO_PEEK_OFF:
1879                 if (!READ_ONCE(sock->ops)->set_peek_off)
1880                         return -EOPNOTSUPP;
1881
1882                 v.val = READ_ONCE(sk->sk_peek_off);
1883                 break;
1884         case SO_NOFCS:
1885                 v.val = sock_flag(sk, SOCK_NOFCS);
1886                 break;
1887
1888         case SO_BINDTODEVICE:
1889                 return sock_getbindtodevice(sk, optval, optlen, len);
1890
1891         case SO_GET_FILTER:
1892                 len = sk_get_filter(sk, optval, len);
1893                 if (len < 0)
1894                         return len;
1895
1896                 goto lenout;
1897
1898         case SO_LOCK_FILTER:
1899                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1900                 break;
1901
1902         case SO_BPF_EXTENSIONS:
1903                 v.val = bpf_tell_extensions();
1904                 break;
1905
1906         case SO_SELECT_ERR_QUEUE:
1907                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1908                 break;
1909
1910 #ifdef CONFIG_NET_RX_BUSY_POLL
1911         case SO_BUSY_POLL:
1912                 v.val = READ_ONCE(sk->sk_ll_usec);
1913                 break;
1914         case SO_PREFER_BUSY_POLL:
1915                 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1916                 break;
1917 #endif
1918
1919         case SO_MAX_PACING_RATE:
1920                 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
1921                 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1922                         lv = sizeof(v.ulval);
1923                         v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1924                 } else {
1925                         /* 32bit version */
1926                         v.val = min_t(unsigned long, ~0U,
1927                                       READ_ONCE(sk->sk_max_pacing_rate));
1928                 }
1929                 break;
1930
1931         case SO_INCOMING_CPU:
1932                 v.val = READ_ONCE(sk->sk_incoming_cpu);
1933                 break;
1934
1935         case SO_MEMINFO:
1936         {
1937                 u32 meminfo[SK_MEMINFO_VARS];
1938
1939                 sk_get_meminfo(sk, meminfo);
1940
1941                 len = min_t(unsigned int, len, sizeof(meminfo));
1942                 if (copy_to_sockptr(optval, &meminfo, len))
1943                         return -EFAULT;
1944
1945                 goto lenout;
1946         }
1947
1948 #ifdef CONFIG_NET_RX_BUSY_POLL
1949         case SO_INCOMING_NAPI_ID:
1950                 v.val = READ_ONCE(sk->sk_napi_id);
1951
1952                 /* aggregate non-NAPI IDs down to 0 */
1953                 if (v.val < MIN_NAPI_ID)
1954                         v.val = 0;
1955
1956                 break;
1957 #endif
1958
1959         case SO_COOKIE:
1960                 lv = sizeof(u64);
1961                 if (len < lv)
1962                         return -EINVAL;
1963                 v.val64 = sock_gen_cookie(sk);
1964                 break;
1965
1966         case SO_ZEROCOPY:
1967                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1968                 break;
1969
1970         case SO_TXTIME:
1971                 lv = sizeof(v.txtime);
1972                 v.txtime.clockid = sk->sk_clockid;
1973                 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1974                                   SOF_TXTIME_DEADLINE_MODE : 0;
1975                 v.txtime.flags |= sk->sk_txtime_report_errors ?
1976                                   SOF_TXTIME_REPORT_ERRORS : 0;
1977                 break;
1978
1979         case SO_BINDTOIFINDEX:
1980                 v.val = READ_ONCE(sk->sk_bound_dev_if);
1981                 break;
1982
1983         case SO_NETNS_COOKIE:
1984                 lv = sizeof(u64);
1985                 if (len != lv)
1986                         return -EINVAL;
1987                 v.val64 = sock_net(sk)->net_cookie;
1988                 break;
1989
1990         case SO_BUF_LOCK:
1991                 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1992                 break;
1993
1994         case SO_RESERVE_MEM:
1995                 v.val = READ_ONCE(sk->sk_reserved_mem);
1996                 break;
1997
1998         case SO_TXREHASH:
1999                 /* Paired with WRITE_ONCE() in sk_setsockopt() */
2000                 v.val = READ_ONCE(sk->sk_txrehash);
2001                 break;
2002
2003         default:
2004                 /* We implement the SO_SNDLOWAT etc to not be settable
2005                  * (1003.1g 7).
2006                  */
2007                 return -ENOPROTOOPT;
2008         }
2009
2010         if (len > lv)
2011                 len = lv;
2012         if (copy_to_sockptr(optval, &v, len))
2013                 return -EFAULT;
2014 lenout:
2015         if (copy_to_sockptr(optlen, &len, sizeof(int)))
2016                 return -EFAULT;
2017         return 0;
2018 }
2019
2020 int sock_getsockopt(struct socket *sock, int level, int optname,
2021                     char __user *optval, int __user *optlen)
2022 {
2023         return sk_getsockopt(sock->sk, level, optname,
2024                              USER_SOCKPTR(optval),
2025                              USER_SOCKPTR(optlen));
2026 }
2027
2028 /*
2029  * Initialize an sk_lock.
2030  *
2031  * (We also register the sk_lock with the lock validator.)
2032  */
2033 static inline void sock_lock_init(struct sock *sk)
2034 {
2035         if (sk->sk_kern_sock)
2036                 sock_lock_init_class_and_name(
2037                         sk,
2038                         af_family_kern_slock_key_strings[sk->sk_family],
2039                         af_family_kern_slock_keys + sk->sk_family,
2040                         af_family_kern_key_strings[sk->sk_family],
2041                         af_family_kern_keys + sk->sk_family);
2042         else
2043                 sock_lock_init_class_and_name(
2044                         sk,
2045                         af_family_slock_key_strings[sk->sk_family],
2046                         af_family_slock_keys + sk->sk_family,
2047                         af_family_key_strings[sk->sk_family],
2048                         af_family_keys + sk->sk_family);
2049 }
2050
2051 /*
2052  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2053  * even temporarly, because of RCU lookups. sk_node should also be left as is.
2054  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2055  */
2056 static void sock_copy(struct sock *nsk, const struct sock *osk)
2057 {
2058         const struct proto *prot = READ_ONCE(osk->sk_prot);
2059 #ifdef CONFIG_SECURITY_NETWORK
2060         void *sptr = nsk->sk_security;
2061 #endif
2062
2063         /* If we move sk_tx_queue_mapping out of the private section,
2064          * we must check if sk_tx_queue_clear() is called after
2065          * sock_copy() in sk_clone_lock().
2066          */
2067         BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2068                      offsetof(struct sock, sk_dontcopy_begin) ||
2069                      offsetof(struct sock, sk_tx_queue_mapping) >=
2070                      offsetof(struct sock, sk_dontcopy_end));
2071
2072         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2073
2074         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2075                prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
2076
2077 #ifdef CONFIG_SECURITY_NETWORK
2078         nsk->sk_security = sptr;
2079         security_sk_clone(osk, nsk);
2080 #endif
2081 }
2082
2083 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2084                 int family)
2085 {
2086         struct sock *sk;
2087         struct kmem_cache *slab;
2088
2089         slab = prot->slab;
2090         if (slab != NULL) {
2091                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2092                 if (!sk)
2093                         return sk;
2094                 if (want_init_on_alloc(priority))
2095                         sk_prot_clear_nulls(sk, prot->obj_size);
2096         } else
2097                 sk = kmalloc(prot->obj_size, priority);
2098
2099         if (sk != NULL) {
2100                 if (security_sk_alloc(sk, family, priority))
2101                         goto out_free;
2102
2103                 if (!try_module_get(prot->owner))
2104                         goto out_free_sec;
2105         }
2106
2107         return sk;
2108
2109 out_free_sec:
2110         security_sk_free(sk);
2111 out_free:
2112         if (slab != NULL)
2113                 kmem_cache_free(slab, sk);
2114         else
2115                 kfree(sk);
2116         return NULL;
2117 }
2118
2119 static void sk_prot_free(struct proto *prot, struct sock *sk)
2120 {
2121         struct kmem_cache *slab;
2122         struct module *owner;
2123
2124         owner = prot->owner;
2125         slab = prot->slab;
2126
2127         cgroup_sk_free(&sk->sk_cgrp_data);
2128         mem_cgroup_sk_free(sk);
2129         security_sk_free(sk);
2130         if (slab != NULL)
2131                 kmem_cache_free(slab, sk);
2132         else
2133                 kfree(sk);
2134         module_put(owner);
2135 }
2136
2137 /**
2138  *      sk_alloc - All socket objects are allocated here
2139  *      @net: the applicable net namespace
2140  *      @family: protocol family
2141  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2142  *      @prot: struct proto associated with this new sock instance
2143  *      @kern: is this to be a kernel socket?
2144  */
2145 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2146                       struct proto *prot, int kern)
2147 {
2148         struct sock *sk;
2149
2150         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2151         if (sk) {
2152                 sk->sk_family = family;
2153                 /*
2154                  * See comment in struct sock definition to understand
2155                  * why we need sk_prot_creator -acme
2156                  */
2157                 sk->sk_prot = sk->sk_prot_creator = prot;
2158                 sk->sk_kern_sock = kern;
2159                 sock_lock_init(sk);
2160                 sk->sk_net_refcnt = kern ? 0 : 1;
2161                 if (likely(sk->sk_net_refcnt)) {
2162                         get_net_track(net, &sk->ns_tracker, priority);
2163                         sock_inuse_add(net, 1);
2164                 } else {
2165                         __netns_tracker_alloc(net, &sk->ns_tracker,
2166                                               false, priority);
2167                 }
2168
2169                 sock_net_set(sk, net);
2170                 refcount_set(&sk->sk_wmem_alloc, 1);
2171
2172                 mem_cgroup_sk_alloc(sk);
2173                 cgroup_sk_alloc(&sk->sk_cgrp_data);
2174                 sock_update_classid(&sk->sk_cgrp_data);
2175                 sock_update_netprioidx(&sk->sk_cgrp_data);
2176                 sk_tx_queue_clear(sk);
2177         }
2178
2179         return sk;
2180 }
2181 EXPORT_SYMBOL(sk_alloc);
2182
2183 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2184  * grace period. This is the case for UDP sockets and TCP listeners.
2185  */
2186 static void __sk_destruct(struct rcu_head *head)
2187 {
2188         struct sock *sk = container_of(head, struct sock, sk_rcu);
2189         struct sk_filter *filter;
2190
2191         if (sk->sk_destruct)
2192                 sk->sk_destruct(sk);
2193
2194         filter = rcu_dereference_check(sk->sk_filter,
2195                                        refcount_read(&sk->sk_wmem_alloc) == 0);
2196         if (filter) {
2197                 sk_filter_uncharge(sk, filter);
2198                 RCU_INIT_POINTER(sk->sk_filter, NULL);
2199         }
2200
2201         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2202
2203 #ifdef CONFIG_BPF_SYSCALL
2204         bpf_sk_storage_free(sk);
2205 #endif
2206
2207         if (atomic_read(&sk->sk_omem_alloc))
2208                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2209                          __func__, atomic_read(&sk->sk_omem_alloc));
2210
2211         if (sk->sk_frag.page) {
2212                 put_page(sk->sk_frag.page);
2213                 sk->sk_frag.page = NULL;
2214         }
2215
2216         /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2217         put_cred(sk->sk_peer_cred);
2218         put_pid(sk->sk_peer_pid);
2219
2220         if (likely(sk->sk_net_refcnt))
2221                 put_net_track(sock_net(sk), &sk->ns_tracker);
2222         else
2223                 __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2224
2225         sk_prot_free(sk->sk_prot_creator, sk);
2226 }
2227
2228 void sk_destruct(struct sock *sk)
2229 {
2230         bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2231
2232         if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2233                 reuseport_detach_sock(sk);
2234                 use_call_rcu = true;
2235         }
2236
2237         if (use_call_rcu)
2238                 call_rcu(&sk->sk_rcu, __sk_destruct);
2239         else
2240                 __sk_destruct(&sk->sk_rcu);
2241 }
2242
2243 static void __sk_free(struct sock *sk)
2244 {
2245         if (likely(sk->sk_net_refcnt))
2246                 sock_inuse_add(sock_net(sk), -1);
2247
2248         if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2249                 sock_diag_broadcast_destroy(sk);
2250         else
2251                 sk_destruct(sk);
2252 }
2253
2254 void sk_free(struct sock *sk)
2255 {
2256         /*
2257          * We subtract one from sk_wmem_alloc and can know if
2258          * some packets are still in some tx queue.
2259          * If not null, sock_wfree() will call __sk_free(sk) later
2260          */
2261         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2262                 __sk_free(sk);
2263 }
2264 EXPORT_SYMBOL(sk_free);
2265
2266 static void sk_init_common(struct sock *sk)
2267 {
2268         skb_queue_head_init(&sk->sk_receive_queue);
2269         skb_queue_head_init(&sk->sk_write_queue);
2270         skb_queue_head_init(&sk->sk_error_queue);
2271
2272         rwlock_init(&sk->sk_callback_lock);
2273         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2274                         af_rlock_keys + sk->sk_family,
2275                         af_family_rlock_key_strings[sk->sk_family]);
2276         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2277                         af_wlock_keys + sk->sk_family,
2278                         af_family_wlock_key_strings[sk->sk_family]);
2279         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2280                         af_elock_keys + sk->sk_family,
2281                         af_family_elock_key_strings[sk->sk_family]);
2282         lockdep_set_class_and_name(&sk->sk_callback_lock,
2283                         af_callback_keys + sk->sk_family,
2284                         af_family_clock_key_strings[sk->sk_family]);
2285 }
2286
2287 /**
2288  *      sk_clone_lock - clone a socket, and lock its clone
2289  *      @sk: the socket to clone
2290  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2291  *
2292  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2293  */
2294 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2295 {
2296         struct proto *prot = READ_ONCE(sk->sk_prot);
2297         struct sk_filter *filter;
2298         bool is_charged = true;
2299         struct sock *newsk;
2300
2301         newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2302         if (!newsk)
2303                 goto out;
2304
2305         sock_copy(newsk, sk);
2306
2307         newsk->sk_prot_creator = prot;
2308
2309         /* SANITY */
2310         if (likely(newsk->sk_net_refcnt)) {
2311                 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2312                 sock_inuse_add(sock_net(newsk), 1);
2313         } else {
2314                 /* Kernel sockets are not elevating the struct net refcount.
2315                  * Instead, use a tracker to more easily detect if a layer
2316                  * is not properly dismantling its kernel sockets at netns
2317                  * destroy time.
2318                  */
2319                 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2320                                       false, priority);
2321         }
2322         sk_node_init(&newsk->sk_node);
2323         sock_lock_init(newsk);
2324         bh_lock_sock(newsk);
2325         newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
2326         newsk->sk_backlog.len = 0;
2327
2328         atomic_set(&newsk->sk_rmem_alloc, 0);
2329
2330         /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2331         refcount_set(&newsk->sk_wmem_alloc, 1);
2332
2333         atomic_set(&newsk->sk_omem_alloc, 0);
2334         sk_init_common(newsk);
2335
2336         newsk->sk_dst_cache     = NULL;
2337         newsk->sk_dst_pending_confirm = 0;
2338         newsk->sk_wmem_queued   = 0;
2339         newsk->sk_forward_alloc = 0;
2340         newsk->sk_reserved_mem  = 0;
2341         atomic_set(&newsk->sk_drops, 0);
2342         newsk->sk_send_head     = NULL;
2343         newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2344         atomic_set(&newsk->sk_zckey, 0);
2345
2346         sock_reset_flag(newsk, SOCK_DONE);
2347
2348         /* sk->sk_memcg will be populated at accept() time */
2349         newsk->sk_memcg = NULL;
2350
2351         cgroup_sk_clone(&newsk->sk_cgrp_data);
2352
2353         rcu_read_lock();
2354         filter = rcu_dereference(sk->sk_filter);
2355         if (filter != NULL)
2356                 /* though it's an empty new sock, the charging may fail
2357                  * if sysctl_optmem_max was changed between creation of
2358                  * original socket and cloning
2359                  */
2360                 is_charged = sk_filter_charge(newsk, filter);
2361         RCU_INIT_POINTER(newsk->sk_filter, filter);
2362         rcu_read_unlock();
2363
2364         if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2365                 /* We need to make sure that we don't uncharge the new
2366                  * socket if we couldn't charge it in the first place
2367                  * as otherwise we uncharge the parent's filter.
2368                  */
2369                 if (!is_charged)
2370                         RCU_INIT_POINTER(newsk->sk_filter, NULL);
2371                 sk_free_unlock_clone(newsk);
2372                 newsk = NULL;
2373                 goto out;
2374         }
2375         RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2376
2377         if (bpf_sk_storage_clone(sk, newsk)) {
2378                 sk_free_unlock_clone(newsk);
2379                 newsk = NULL;
2380                 goto out;
2381         }
2382
2383         /* Clear sk_user_data if parent had the pointer tagged
2384          * as not suitable for copying when cloning.
2385          */
2386         if (sk_user_data_is_nocopy(newsk))
2387                 newsk->sk_user_data = NULL;
2388
2389         newsk->sk_err      = 0;
2390         newsk->sk_err_soft = 0;
2391         newsk->sk_priority = 0;
2392         newsk->sk_incoming_cpu = raw_smp_processor_id();
2393
2394         /* Before updating sk_refcnt, we must commit prior changes to memory
2395          * (Documentation/RCU/rculist_nulls.rst for details)
2396          */
2397         smp_wmb();
2398         refcount_set(&newsk->sk_refcnt, 2);
2399
2400         sk_set_socket(newsk, NULL);
2401         sk_tx_queue_clear(newsk);
2402         RCU_INIT_POINTER(newsk->sk_wq, NULL);
2403
2404         if (newsk->sk_prot->sockets_allocated)
2405                 sk_sockets_allocated_inc(newsk);
2406
2407         if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2408                 net_enable_timestamp();
2409 out:
2410         return newsk;
2411 }
2412 EXPORT_SYMBOL_GPL(sk_clone_lock);
2413
2414 void sk_free_unlock_clone(struct sock *sk)
2415 {
2416         /* It is still raw copy of parent, so invalidate
2417          * destructor and make plain sk_free() */
2418         sk->sk_destruct = NULL;
2419         bh_unlock_sock(sk);
2420         sk_free(sk);
2421 }
2422 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2423
2424 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2425 {
2426         bool is_ipv6 = false;
2427         u32 max_size;
2428
2429 #if IS_ENABLED(CONFIG_IPV6)
2430         is_ipv6 = (sk->sk_family == AF_INET6 &&
2431                    !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2432 #endif
2433         /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2434         max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2435                         READ_ONCE(dst->dev->gso_ipv4_max_size);
2436         if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2437                 max_size = GSO_LEGACY_MAX_SIZE;
2438
2439         return max_size - (MAX_TCP_HEADER + 1);
2440 }
2441
2442 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2443 {
2444         u32 max_segs = 1;
2445
2446         sk->sk_route_caps = dst->dev->features;
2447         if (sk_is_tcp(sk))
2448                 sk->sk_route_caps |= NETIF_F_GSO;
2449         if (sk->sk_route_caps & NETIF_F_GSO)
2450                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2451         if (unlikely(sk->sk_gso_disabled))
2452                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2453         if (sk_can_gso(sk)) {
2454                 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2455                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2456                 } else {
2457                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2458                         sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2459                         /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2460                         max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2461                 }
2462         }
2463         sk->sk_gso_max_segs = max_segs;
2464         sk_dst_set(sk, dst);
2465 }
2466 EXPORT_SYMBOL_GPL(sk_setup_caps);
2467
2468 /*
2469  *      Simple resource managers for sockets.
2470  */
2471
2472
2473 /*
2474  * Write buffer destructor automatically called from kfree_skb.
2475  */
2476 void sock_wfree(struct sk_buff *skb)
2477 {
2478         struct sock *sk = skb->sk;
2479         unsigned int len = skb->truesize;
2480         bool free;
2481
2482         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2483                 if (sock_flag(sk, SOCK_RCU_FREE) &&
2484                     sk->sk_write_space == sock_def_write_space) {
2485                         rcu_read_lock();
2486                         free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2487                         sock_def_write_space_wfree(sk);
2488                         rcu_read_unlock();
2489                         if (unlikely(free))
2490                                 __sk_free(sk);
2491                         return;
2492                 }
2493
2494                 /*
2495                  * Keep a reference on sk_wmem_alloc, this will be released
2496                  * after sk_write_space() call
2497                  */
2498                 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2499                 sk->sk_write_space(sk);
2500                 len = 1;
2501         }
2502         /*
2503          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2504          * could not do because of in-flight packets
2505          */
2506         if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2507                 __sk_free(sk);
2508 }
2509 EXPORT_SYMBOL(sock_wfree);
2510
2511 /* This variant of sock_wfree() is used by TCP,
2512  * since it sets SOCK_USE_WRITE_QUEUE.
2513  */
2514 void __sock_wfree(struct sk_buff *skb)
2515 {
2516         struct sock *sk = skb->sk;
2517
2518         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2519                 __sk_free(sk);
2520 }
2521
2522 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2523 {
2524         skb_orphan(skb);
2525         skb->sk = sk;
2526 #ifdef CONFIG_INET
2527         if (unlikely(!sk_fullsock(sk))) {
2528                 skb->destructor = sock_edemux;
2529                 sock_hold(sk);
2530                 return;
2531         }
2532 #endif
2533         skb->destructor = sock_wfree;
2534         skb_set_hash_from_sk(skb, sk);
2535         /*
2536          * We used to take a refcount on sk, but following operation
2537          * is enough to guarantee sk_free() wont free this sock until
2538          * all in-flight packets are completed
2539          */
2540         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2541 }
2542 EXPORT_SYMBOL(skb_set_owner_w);
2543
2544 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2545 {
             /* Return true when @skb's destructor is sock_wfree, or tcp_wfree
              * on CONFIG_INET builds. On CONFIG_TLS_DEVICE builds an skb with
              * ->decrypted set is always refused.
              */
2546 #ifdef CONFIG_TLS_DEVICE
2547         /* Drivers depend on in-order delivery for crypto offload,
2548          * partial orphan breaks out-of-order-OK logic.
2549          */
2550         if (skb->decrypted)
2551                 return false;
2552 #endif
2553         return (skb->destructor == sock_wfree ||
2554                 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2555 }
2556
2557 /* Helper for netem, which can park packets in its delay queue. The
2558  * owner socket should be able to keep sending as if those packets had
2559  * already been TX completed by a typical driver, yet skb->sk must stay
2560  * set because some packet schedulers (sch_fq for example) rely on it.
2561  * Pure TCP ACKs are left untouched.
2562  */
2563 void skb_orphan_partial(struct sk_buff *skb)
2564 {
2565         if (skb_is_tcp_pure_ack(skb))
2566                 return;
2567
2568         /* Fall back to a full orphan when the skb cannot be re-owned. */
2569         if (!can_skb_orphan_partial(skb) ||
2570             !skb_set_owner_sk_safe(skb, skb->sk))
2571                 skb_orphan(skb);
2572 }
2573 EXPORT_SYMBOL(skb_orphan_partial);
2574
2575 /*
2576  * Read buffer destructor automatically called from kfree_skb.
      *
      * Subtracts skb->truesize from the owning socket's sk_rmem_alloc and
      * hands the same amount to sk_mem_uncharge().
2577  */
2578 void sock_rfree(struct sk_buff *skb)
2579 {
2580         struct sock *sk = skb->sk;
2581         unsigned int len = skb->truesize;
2582
2583         atomic_sub(len, &sk->sk_rmem_alloc);
2584         sk_mem_uncharge(sk, len);
2585 }
2586 EXPORT_SYMBOL(sock_rfree);
2587
2588 /*
2589  * Buffer destructor for skbs that are not used directly in read or write
2590  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
      *
      * The only action is sock_put() on skb->sk; no memory counter is touched.
2591  */
2592 void sock_efree(struct sk_buff *skb)
2593 {
2594         sock_put(skb->sk);
2595 }
2596 EXPORT_SYMBOL(sock_efree);
2597
2598 /* Buffer destructor for prefetch/receive path where reference count may
2599  * not be held, e.g. for listen sockets.
      *
      * sock_gen_put() is called only when sk_is_refcounted() reports that
      * skb->sk is refcounted; otherwise nothing is done. Built only with
      * CONFIG_INET.
2600  */
2601 #ifdef CONFIG_INET
2602 void sock_pfree(struct sk_buff *skb)
2603 {
2604         if (sk_is_refcounted(skb->sk))
2605                 sock_gen_put(skb->sk);
2606 }
2607 EXPORT_SYMBOL(sock_pfree);
2608 #endif /* CONFIG_INET */
2609
2610 kuid_t sock_i_uid(struct sock *sk)
2611 {
             /* Return the i_uid of the inode behind sk->sk_socket, or
              * GLOBAL_ROOT_UID when no socket is attached. The read is done
              * under sk_callback_lock taken with read_lock_bh().
              */
2612         kuid_t uid;
2613
2614         read_lock_bh(&sk->sk_callback_lock);
2615         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2616         read_unlock_bh(&sk->sk_callback_lock);
2617         return uid;
2618 }
2619 EXPORT_SYMBOL(sock_i_uid);
2620
2621 unsigned long __sock_i_ino(struct sock *sk)
2622 {
             /* Return the i_ino of the inode behind sk->sk_socket, or 0 when
              * no socket is attached. Uses plain read_lock(), i.e. it does not
              * disable bottom halves itself; sock_i_ino() is the variant that
              * wraps this call in local_bh_disable()/local_bh_enable().
              */
2623         unsigned long ino;
2624
2625         read_lock(&sk->sk_callback_lock);
2626         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2627         read_unlock(&sk->sk_callback_lock);
2628         return ino;
2629 }
2630 EXPORT_SYMBOL(__sock_i_ino);
2631
2632 unsigned long sock_i_ino(struct sock *sk)
2633 {
             /* Same result as __sock_i_ino(), with bottom halves disabled
              * around the call.
              */
2634         unsigned long ino;
2635
2636         local_bh_disable();
2637         ino = __sock_i_ino(sk);
2638         local_bh_enable();
2639         return ino;
2640 }
2641 EXPORT_SYMBOL(sock_i_ino);
2642
2643 /*
2644  * Allocate an skb charged to the socket's send buffer. Unless @force is
2645  * set, refuse once sk_wmem_alloc has reached sk_sndbuf. NULL on failure.
2646  */
2647 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2648                              gfp_t priority)
2649 {
2650         struct sk_buff *new_skb;
2651
2652         if (!force &&
2653             refcount_read(&sk->sk_wmem_alloc) >= READ_ONCE(sk->sk_sndbuf))
2654                 return NULL;
2655         new_skb = alloc_skb(size, priority);
2656         if (new_skb)
2657                 skb_set_owner_w(new_skb, sk);
2658         return new_skb;
2659 }
2660 EXPORT_SYMBOL(sock_wmalloc);
2661
2662 static void sock_ofree(struct sk_buff *skb)
2663 {
             /* Destructor installed by sock_omalloc(): gives the skb's
              * truesize back to the owning socket's sk_omem_alloc.
              */
2664         struct sock *sk = skb->sk;
2665
2666         atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2667 }
2668
2669 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2670                              gfp_t priority)
2671 {
             /* Allocate an skb accounted against the socket's option memory
              * (sk_omem_alloc) rather than its send buffer. Returns NULL when
              * the request would exceed sysctl_optmem_max or alloc_skb()
              * fails. The skb is released through sock_ofree().
              */
2672         struct sk_buff *skb;
2673
2674         /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2675         if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2676             READ_ONCE(sysctl_optmem_max))
2677                 return NULL;
2678
2679         skb = alloc_skb(size, priority);
2680         if (!skb)
2681                 return NULL;
2682
             /* Charge the real truesize; sock_ofree() subtracts the same field. */
2683         atomic_add(skb->truesize, &sk->sk_omem_alloc);
2684         skb->sk = sk;
2685         skb->destructor = sock_ofree;
2686         return skb;
2687 }
2688
2689 /*
2690  * Allocate @size bytes charged to the socket's option memory, or NULL.
2691  */
2692 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2693 {
2694         int limit = READ_ONCE(sysctl_optmem_max);
2695         void *buf;
2696
2697         if ((unsigned int)size > limit ||
2698             atomic_read(&sk->sk_omem_alloc) + size >= limit)
2699                 return NULL;
2700
2701         /* Charge sk_omem_alloc before allocating, to avoid the race
2702          * if kmalloc might sleep; undo the charge on failure.
2703          */
2704         atomic_add(size, &sk->sk_omem_alloc);
2705         buf = kmalloc(size, priority);
2706         if (!buf)
2707                 atomic_sub(size, &sk->sk_omem_alloc);
2708         return buf;
2709 }
2710 EXPORT_SYMBOL(sock_kmalloc);
2711
2712 /* Free an option memory block. Note, we actually want the inline
2713  * here as this allows gcc to detect the nullify and fold away the
2714  * condition entirely.
      *
      * @nullify selects kfree_sensitive() instead of kfree(). After the free,
      * @size is subtracted from sk_omem_alloc, so it should match the size
      * that was charged. A NULL @mem triggers a one-time warning and returns
      * without touching the counter.
2715  */
2716 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2717                                   const bool nullify)
2718 {
2719         if (WARN_ON_ONCE(!mem))
2720                 return;
2721         if (nullify)
2722                 kfree_sensitive(mem);
2723         else
2724                 kfree(mem);
2725         atomic_sub(size, &sk->sk_omem_alloc);
2726 }
2727
2728 void sock_kfree_s(struct sock *sk, void *mem, int size)
2729 {
             /* Plain variant: frees @mem with kfree() and uncharges @size. */
2730         __sock_kfree_s(sk, mem, size, false);
2731 }
2732 EXPORT_SYMBOL(sock_kfree_s);
2733
2734 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2735 {
             /* Sensitive-data variant: frees @mem with kfree_sensitive()
              * and uncharges @size.
              */
2736         __sock_kfree_s(sk, mem, size, true);
2737 }
2738 EXPORT_SYMBOL(sock_kzfree_s);
2739
2740 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2741    I think, these locks should be removed for datagram sockets.
        Sleeps interruptibly on sk_sleep(sk) until sk_wmem_alloc drops below
        sk_sndbuf, SEND_SHUTDOWN is set, sk_err is set, a signal is pending
        or @timeo reaches zero. Returns the updated @timeo: the result of the
        last schedule_timeout(), or the value passed in if no sleep happened.
2742  */
2743 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2744 {
2745         DEFINE_WAIT(wait);
2746
2747         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2748         for (;;) {
2749                 if (!timeo)
2750                         break;
2751                 if (signal_pending(current))
2752                         break;
2753                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                     /* Queue ourselves before re-checking the conditions
                      * below, and only then call schedule_timeout().
                      */
2754                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2755                 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2756                         break;
2757                 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2758                         break;
2759                 if (READ_ONCE(sk->sk_err))
2760                         break;
2761                 timeo = schedule_timeout(timeo);
2762         }
2763         finish_wait(sk_sleep(sk), &wait);
2764         return timeo;
2765 }
2766
2767
2768 /*
2769  *      Generic send/receive buffer handlers
2770  */
2771
2772 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2773                                      unsigned long data_len, int noblock,
2774                                      int *errcode, int max_page_order)
2775 {
             /* Allocate an skb for transmission and charge it to @sk with
              * skb_set_owner_w(). While sk_wmem_alloc is not below sk_sndbuf
              * the caller waits in sock_wait_for_wmem(), bounded by the
              * timeout obtained from sock_sndtimeo(sk, noblock).
              *
              * Returns the skb, or NULL. On the failure paths handled here
              * *errcode is set to the pending socket error, -EPIPE (send side
              * shut down), -EAGAIN (no timeout left) or the value of
              * sock_intr_errno() (signal pending). When the allocation itself
              * fails, *errcode is whatever alloc_skb_with_frags() stored.
              */
2776         struct sk_buff *skb;
2777         long timeo;
2778         int err;
2779
2780         timeo = sock_sndtimeo(sk, noblock);
2781         for (;;) {
2782                 err = sock_error(sk);
2783                 if (err != 0)
2784                         goto failure;
2785
2786                 err = -EPIPE;
2787                 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2788                         goto failure;
2789
                     /* Room available in the send buffer: go allocate. */
2790                 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2791                         break;
2792
2793                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2794                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2795                 err = -EAGAIN;
2796                 if (!timeo)
2797                         goto failure;
2798                 if (signal_pending(current))
2799                         goto interrupted;
2800                 timeo = sock_wait_for_wmem(sk, timeo);
2801         }
2802         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2803                                    errcode, sk->sk_allocation);
2804         if (skb)
2805                 skb_set_owner_w(skb, sk);
2806         return skb;
2807
2808 interrupted:
2809         err = sock_intr_errno(timeo);
2810 failure:
2811         *errcode = err;
2812         return NULL;
2813 }
2814 EXPORT_SYMBOL(sock_alloc_send_pskb);
2815
2816 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2817                      struct sockcm_cookie *sockc)
2818 {
             /* Apply a single control message to @sockc, dispatching on
              * cmsg->cmsg_type. Returns 0 on success, -EPERM when SO_MARK is
              * used without CAP_NET_RAW or CAP_NET_ADMIN in the socket's
              * netns user namespace, and -EINVAL for a bad length, bad flags,
              * SCM_TXTIME without SOCK_TXTIME, or an unknown type.
              */
2819         u32 tsflags;
2820
2821         switch (cmsg->cmsg_type) {
2822         case SO_MARK:
2823                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2824                     !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2825                         return -EPERM;
2826                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2827                         return -EINVAL;
2828                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2829                 break;
2830         case SO_TIMESTAMPING_OLD:
2831         case SO_TIMESTAMPING_NEW:
2832                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2833                         return -EINVAL;
2834
2835                 tsflags = *(u32 *)CMSG_DATA(cmsg);
                     /* Only bits inside SOF_TIMESTAMPING_TX_RECORD_MASK may be set. */
2836                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2837                         return -EINVAL;
2838
                     /* Replace the TX-record bits, keep every other flag. */
2839                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2840                 sockc->tsflags |= tsflags;
2841                 break;
2842         case SCM_TXTIME:
2843                 if (!sock_flag(sk, SOCK_TXTIME))
2844                         return -EINVAL;
2845                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2846                         return -EINVAL;
2847                 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2848                 break;
2849         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2850         case SCM_RIGHTS:
2851         case SCM_CREDENTIALS:
2852                 break;
2853         default:
2854                 return -EINVAL;
2855         }
2856         return 0;
2857 }
2858 EXPORT_SYMBOL(__sock_cmsg_send);
2859
2860 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2861                    struct sockcm_cookie *sockc)
2862 {
             /* Walk every control message in @msg and feed the SOL_SOCKET
              * ones to __sock_cmsg_send(). Messages of other levels are
              * skipped. Returns -EINVAL as soon as a header fails CMSG_OK(),
              * the first non-zero result of __sock_cmsg_send(), or 0.
              */
2863         struct cmsghdr *cmsg;
2864         int ret;
2865
2866         for_each_cmsghdr(cmsg, msg) {
2867                 if (!CMSG_OK(msg, cmsg))
2868                         return -EINVAL;
2869                 if (cmsg->cmsg_level != SOL_SOCKET)
2870                         continue;
2871                 ret = __sock_cmsg_send(sk, cmsg, sockc);
2872                 if (ret)
2873                         return ret;
2874         }
2875         return 0;
2876 }
2877 EXPORT_SYMBOL(sock_cmsg_send);
2878
2879 static void sk_enter_memory_pressure(struct sock *sk)
2880 {
2881         /* The protocol hook is optional; do nothing when it is absent. */
2882         if (sk->sk_prot->enter_memory_pressure)
2883                 sk->sk_prot->enter_memory_pressure(sk);
2884 }
2886
2887 static void sk_leave_memory_pressure(struct sock *sk)
2888 {
             /* Use the protocol's leave_memory_pressure hook when it has one;
              * otherwise clear the protocol's memory_pressure word directly,
              * if the protocol provides one.
              */
2889         if (sk->sk_prot->leave_memory_pressure) {
2890                 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2891                                      tcp_leave_memory_pressure, sk);
2892         } else {
2893                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2894
                     /* Write only when the flag is currently set. */
2895                 if (memory_pressure && READ_ONCE(*memory_pressure))
2896                         WRITE_ONCE(*memory_pressure, 0);
2897         }
2898 }
2899
2900 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); /* when enabled, skb_page_frag_refill() skips its high-order attempt */
2901
2902 /**
2903  * skb_page_frag_refill - check that a page_frag contains enough room
2904  * @sz: minimum size of the fragment we want to get
2905  * @pfrag: pointer to page_frag
2906  * @gfp: priority for memory allocation
2907  *
2908  * Note: While this allocator tries to use high order pages, there is
2909  * no guarantee that allocations succeed. Therefore, @sz MUST be
2910  * less than or equal to PAGE_SIZE.
      *
      * Return: true if @pfrag can be used (the current page was kept or a
      * new one was allocated), false if no page could be allocated.
2911  */
2912 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2913 {
2914         if (pfrag->page) {
                     /* Page reference count is 1: rewind to offset 0 and
                      * reuse the whole page.
                      */
2915                 if (page_ref_count(pfrag->page) == 1) {
2916                         pfrag->offset = 0;
2917                         return true;
2918                 }
2919                 if (pfrag->offset + sz <= pfrag->size)
2920                         return true;
                     /* Not enough room left: release the page and fall
                      * through to allocate a replacement.
                      */
2921                 put_page(pfrag->page);
2922         }
2923
2924         pfrag->offset = 0;
2925         if (SKB_FRAG_PAGE_ORDER &&
2926             !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2927                 /* Avoid direct reclaim but allow kswapd to wake */
2928                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2929                                           __GFP_COMP | __GFP_NOWARN |
2930                                           __GFP_NORETRY,
2931                                           SKB_FRAG_PAGE_ORDER);
2932                 if (likely(pfrag->page)) {
2933                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2934                         return true;
2935                 }
2936         }
             /* Fallback: a single page with the caller's unmodified @gfp. */
2937         pfrag->page = alloc_page(gfp);
2938         if (likely(pfrag->page)) {
2939                 pfrag->size = PAGE_SIZE;
2940                 return true;
2941         }
2942         return false;
2943 }
2944 EXPORT_SYMBOL(skb_page_frag_refill);
2945
2946 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2947 {
             /* Socket wrapper around skb_page_frag_refill(): asks for at
              * least 32 bytes using sk->sk_allocation. On failure it enters
              * memory pressure, moderates the send buffer and returns false.
              */
2948         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2949                 return true;
2950
2951         sk_enter_memory_pressure(sk);
2952         sk_stream_moderate_sndbuf(sk);
2953         return false;
2954 }
2955 EXPORT_SYMBOL(sk_page_frag_refill);
2956
2957 void __lock_sock(struct sock *sk)
2958         __releases(&sk->sk_lock.slock)
2959         __acquires(&sk->sk_lock.slock)
2960 {
             /* Sleep (uninterruptibly, as an exclusive waiter on
              * sk->sk_lock.wq) until sock_owned_by_user() is false. Per the
              * annotations above, sk_lock.slock is held on entry and on
              * return, and is dropped around each schedule().
              */
2961         DEFINE_WAIT(wait);
2962
2963         for (;;) {
2964                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2965                                         TASK_UNINTERRUPTIBLE);
2966                 spin_unlock_bh(&sk->sk_lock.slock);
2967                 schedule();
2968                 spin_lock_bh(&sk->sk_lock.slock);
2969                 if (!sock_owned_by_user(sk))
2970                         break;
2971         }
2972         finish_wait(&sk->sk_lock.wq, &wait);
2973 }
2974
2975 void __release_sock(struct sock *sk)
2976         __releases(&sk->sk_lock.slock)
2977         __acquires(&sk->sk_lock.slock)
2978 {
             /* Drain sk->sk_backlog: each queued skb is unlinked and passed
              * to sk_backlog_rcv(). Per the annotations above, sk_lock.slock
              * is held on entry and on return; it is dropped while a batch
              * is being processed.
              */
2979         struct sk_buff *skb, *next;
2980
2981         while ((skb = sk->sk_backlog.head) != NULL) {
                     /* Detach the whole list while still holding the lock. */
2982                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2983
2984                 spin_unlock_bh(&sk->sk_lock.slock);
2985
2986                 do {
2987                         next = skb->next;
2988                         prefetch(next);
2989                         DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
2990                         skb_mark_not_on_list(skb);
2991                         sk_backlog_rcv(sk, skb);
2992
2993                         cond_resched();
2994
2995                         skb = next;
2996                 } while (skb != NULL);
2997
                     /* Retake the lock and loop in case more skbs were
                      * queued in the meantime.
                      */
2998                 spin_lock_bh(&sk->sk_lock.slock);
2999         }
3000
3001         /*
3002          * Doing the zeroing here guarantees we can not loop forever
3003          * while a wild producer attempts to flood us.
3004          */
3005         sk->sk_backlog.len = 0;
3006 }
3007
3008 void __sk_flush_backlog(struct sock *sk)
3009 {
             /* Process the backlog now: take sk_lock.slock, run
              * __release_sock(), drop the lock again.
              */
3010         spin_lock_bh(&sk->sk_lock.slock);
3011         __release_sock(sk);
3012         spin_unlock_bh(&sk->sk_lock.slock);
3013 }
3014 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3015
3016 /**
3017  * sk_wait_data - wait for data to arrive at sk_receive_queue
3018  * @sk:    sock to wait on
3019  * @timeo: for how long
3020  * @skb:   last skb seen on sk_receive_queue
3021  *
3022  * Now socket state including sk->sk_err is changed only under lock,
3023  * hence we may omit checks after joining wait queue.
3024  * We check receive queue before schedule() only as optimization;
3025  * it is very likely that release_sock() added new data.
      *
      * The wait condition is "the tail of sk_receive_queue is no longer @skb".
      * SOCKWQ_ASYNC_WAITDATA is set for the duration of the wait.
      *
      * Return: the value produced by sk_wait_event().
3026  */
3027 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3028 {
3029         DEFINE_WAIT_FUNC(wait, woken_wake_function);
3030         int rc;
3031
3032         add_wait_queue(sk_sleep(sk), &wait);
3033         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3034         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3035         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3036         remove_wait_queue(sk_sleep(sk), &wait);
3037         return rc;
3038 }
3039 EXPORT_SYMBOL(sk_wait_data);
3040
3041 /**
3042  *      __sk_mem_raise_allocated - increase memory_allocated
3043  *      @sk: socket
3044  *      @size: memory size to allocate
3045  *      @amt: pages to allocate
3046  *      @kind: allocation type
3047  *
3048  *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
      *
      *      Return: 1 if the allocation is accepted; 0 if it is suppressed, in
      *      which case the @amt added to memory_allocated and any successful
      *      memcg charge have been undone.
3049  */
3050 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3051 {
3052         bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
3053         struct proto *prot = sk->sk_prot;
3054         bool charged = true;
3055         long allocated;
3056
             /* Account @amt up front; it is subtracted again before
              * returning 0.
              */
3057         sk_memory_allocated_add(sk, amt);
3058         allocated = sk_memory_allocated(sk);
3059         if (memcg_charge &&
3060             !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
3061                                                 gfp_memcg_charge())))
3062                 goto suppress_allocation;
3063
3064         /* Under limit. */
3065         if (allocated <= sk_prot_mem_limits(sk, 0)) {
3066                 sk_leave_memory_pressure(sk);
3067                 return 1;
3068         }
3069
3070         /* Under pressure. */
3071         if (allocated > sk_prot_mem_limits(sk, 1))
3072                 sk_enter_memory_pressure(sk);
3073
3074         /* Over hard limit. */
3075         if (allocated > sk_prot_mem_limits(sk, 2))
3076                 goto suppress_allocation;
3077
3078         /* guarantee minimum buffer size under pressure */
3079         if (kind == SK_MEM_RECV) {
3080                 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3081                         return 1;
3082
3083         } else { /* SK_MEM_SEND */
3084                 int wmem0 = sk_get_wmem0(sk, prot);
3085
                     /* Stream sockets are judged on sk_wmem_queued, all
                      * other types on sk_wmem_alloc.
                      */
3086                 if (sk->sk_type == SOCK_STREAM) {
3087                         if (sk->sk_wmem_queued < wmem0)
3088                                 return 1;
3089                 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3090                                 return 1;
3091                 }
3092         }
3093
3094         if (sk_has_memory_pressure(sk)) {
3095                 u64 alloc;
3096
3097                 if (!sk_under_memory_pressure(sk))
3098                         return 1;
                     /* Accept if the hard limit exceeds (number of sockets)
                      * times (pages this socket currently uses).
                      */
3099                 alloc = sk_sockets_allocated_read_positive(sk);
3100                 if (sk_prot_mem_limits(sk, 2) > alloc *
3101                     sk_mem_pages(sk->sk_wmem_queued +
3102                                  atomic_read(&sk->sk_rmem_alloc) +
3103                                  sk->sk_forward_alloc))
3104                         return 1;
3105         }
3106
3107 suppress_allocation:
3108
3109         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3110                 sk_stream_moderate_sndbuf(sk);
3111
3112                 /* Fail only if socket is _under_ its sndbuf.
3113                  * In this case we cannot block, so that we have to fail.
3114                  */
3115                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3116                         /* Force charge with __GFP_NOFAIL */
3117                         if (memcg_charge && !charged) {
3118                                 mem_cgroup_charge_skmem(sk->sk_memcg, amt,
3119                                         gfp_memcg_charge() | __GFP_NOFAIL);
3120                         }
3121                         return 1;
3122                 }
3123         }
3124
3125         if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3126                 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3127
             /* Roll back the accounting done at the top of the function. */
3128         sk_memory_allocated_sub(sk, amt);
3129
3130         if (memcg_charge && charged)
3131                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
3132
3133         return 0;
3134 }
3135
3136 /**
3137  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3138  *      @sk: socket
3139  *      @size: memory size to allocate
3140  *      @kind: allocation type
3141  *
3142  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3143  *      rmem allocation. This function assumes that protocols which have
3144  *      memory_pressure use sk_wmem_queued as write buffer accounting.
      *
      *      @size is converted to pages with sk_mem_pages(); sk_forward_alloc is
      *      raised by that many pages worth of bytes and lowered again if
      *      __sk_mem_raise_allocated() refuses.
      *
      *      Return: the result of __sk_mem_raise_allocated() (non-zero on
      *      success, 0 on refusal).
3145  */
3146 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3147 {
3148         int ret, amt = sk_mem_pages(size);
3149
3150         sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3151         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3152         if (!ret)
3153                 sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3154         return ret;
3155 }
3156 EXPORT_SYMBOL(__sk_mem_schedule);
3157
3158 /**
3159  *      __sk_mem_reduce_allocated - reclaim memory_allocated
3160  *      @sk: socket
3161  *      @amount: number of quanta
3162  *
3163  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
      *
      *      Also uncharges @amount from the socket's memcg when one is attached,
      *      and leaves memory pressure once memory_allocated has fallen below
      *      limit index 0 while under global memory pressure.
3164  */
3165 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3166 {
3167         sk_memory_allocated_sub(sk, amount);
3168
3169         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3170                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3171
3172         if (sk_under_global_memory_pressure(sk) &&
3173             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3174                 sk_leave_memory_pressure(sk);
3175 }
3176
3177 /**
3178  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3179  *      @sk: socket
3180  *      @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3181  */
3182 void __sk_mem_reclaim(struct sock *sk, int amount)
3183 {
             /* From here on @amount is a page count, not a byte count. */
3184         amount >>= PAGE_SHIFT;
3185         sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3186         __sk_mem_reduce_allocated(sk, amount);
3187 }
3188 EXPORT_SYMBOL(__sk_mem_reclaim);
3189
3190 int sk_set_peek_off(struct sock *sk, int val)
3191 {
             /* Store @val in sk->sk_peek_off with WRITE_ONCE(); always
              * returns 0.
              */
3192         WRITE_ONCE(sk->sk_peek_off, val);
3193         return 0;
3194 }
3195 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3196
3197 /*
3198  * Set of default routines for initialising struct proto_ops when
3199  * the protocol does not support a particular function. In certain
3200  * cases where it makes no sense for a protocol to have a "do nothing"
3201  * function, some default processing is provided.
3202  */
3203
3204 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3205 {
             /* Default bind handler: ignores its arguments, always -EOPNOTSUPP. */
3206         return -EOPNOTSUPP;
3207 }
3208 EXPORT_SYMBOL(sock_no_bind);
3209
3210 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3211                     int len, int flags)
3212 {
             /* Default connect handler: ignores its arguments, always -EOPNOTSUPP. */
3213         return -EOPNOTSUPP;
3214 }
3215 EXPORT_SYMBOL(sock_no_connect);
3216
3217 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3218 {
             /* Default socketpair handler: ignores its arguments, always -EOPNOTSUPP. */
3219         return -EOPNOTSUPP;
3220 }
3221 EXPORT_SYMBOL(sock_no_socketpair);
3222
3223 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
3224                    bool kern)
3225 {
             /* Default accept handler: ignores its arguments, always -EOPNOTSUPP. */
3226         return -EOPNOTSUPP;
3227 }
3228 EXPORT_SYMBOL(sock_no_accept);
3229
3230 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3231                     int peer)
3232 {
             /* Default getname handler: ignores its arguments, always -EOPNOTSUPP. */
3233         return -EOPNOTSUPP;
3234 }
3235 EXPORT_SYMBOL(sock_no_getname);
3236
3237 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3238 {
             /* Default ioctl handler: ignores its arguments, always -EOPNOTSUPP. */
3239         return -EOPNOTSUPP;
3240 }
3241 EXPORT_SYMBOL(sock_no_ioctl);
3242
3243 int sock_no_listen(struct socket *sock, int backlog)
3244 {
             /* Default listen handler: ignores its arguments, always -EOPNOTSUPP. */
3245         return -EOPNOTSUPP;
3246 }
3247 EXPORT_SYMBOL(sock_no_listen);
3248
3249 int sock_no_shutdown(struct socket *sock, int how)
3250 {
             /* Default shutdown handler: ignores its arguments, always -EOPNOTSUPP. */
3251         return -EOPNOTSUPP;
3252 }
3253 EXPORT_SYMBOL(sock_no_shutdown);
3254
3255 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3256 {
             /* Default sendmsg handler: ignores its arguments, always -EOPNOTSUPP. */
3257         return -EOPNOTSUPP;
3258 }
3259 EXPORT_SYMBOL(sock_no_sendmsg);
3260
3261 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3262 {
             /* Same as sock_no_sendmsg() but takes a struct sock: always -EOPNOTSUPP. */
3263         return -EOPNOTSUPP;
3264 }
3265 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3266
3267 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3268                     int flags)
3269 {
             /* Default recvmsg handler: ignores its arguments, always -EOPNOTSUPP. */
3270         return -EOPNOTSUPP;
3271 }
3272 EXPORT_SYMBOL(sock_no_recvmsg);
3273
3274 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3275 {
             /* Default mmap handler. Unlike the other sock_no_*() stubs it
              * returns -ENODEV rather than -EOPNOTSUPP.
              */
3276         /* Mirror missing mmap method error code */
3277         return -ENODEV;
3278 }
3279 EXPORT_SYMBOL(sock_no_mmap);
3280
3281 /*
3282  * Called when a file is received (via SCM_RIGHTS, etc). If the file is
3283  * a socket, refresh the netprioidx and classid of its sock's cgroup data.
3284  */
3285 void __receive_sock(struct file *file)
3286 {
3287         struct socket *sock = sock_from_file(file);
3288
3289         if (!sock)
3290                 return;
3291
3292         sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3293         sock_update_classid(&sock->sk->sk_cgrp_data);
3294 }
3295
3296 /*
3297  *      Default Socket Callbacks
3298  */
3299
3300 static void sock_def_wakeup(struct sock *sk)
3301 {
             /* Default sk_state_change callback (installed by
              * sock_init_data_uid()): wakes every interruptible sleeper on
              * the socket's wait queue, if there is one.
              */
3302         struct socket_wq *wq;
3303
3304         rcu_read_lock();
3305         wq = rcu_dereference(sk->sk_wq);
3306         if (skwq_has_sleeper(wq))
3307                 wake_up_interruptible_all(&wq->wait);
3308         rcu_read_unlock();
3309 }
3310
3311 static void sock_def_error_report(struct sock *sk)
3312 {
             /* Default sk_error_report callback: wakes pollers with EPOLLERR
              * and sends the SOCK_WAKE_IO / POLL_ERR async notification. The
              * async notification is sent whether or not anyone was sleeping.
              */
3313         struct socket_wq *wq;
3314
3315         rcu_read_lock();
3316         wq = rcu_dereference(sk->sk_wq);
3317         if (skwq_has_sleeper(wq))
3318                 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3319         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3320         rcu_read_unlock();
3321 }
3322
3323 void sock_def_readable(struct sock *sk)
3324 {
             /* Default sk_data_ready callback: emits the sk_data_ready trace
              * event, wakes pollers waiting for readable events and sends the
              * SOCK_WAKE_WAITD / POLL_IN async notification.
              */
3325         struct socket_wq *wq;
3326
3327         trace_sk_data_ready(sk);
3328
3329         rcu_read_lock();
3330         wq = rcu_dereference(sk->sk_wq);
3331         if (skwq_has_sleeper(wq))
3332                 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3333                                                 EPOLLRDNORM | EPOLLRDBAND);
3334         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3335         rcu_read_unlock();
3336 }
3337
3338 static void sock_def_write_space(struct sock *sk)
3339 {
             /* Default sk_write_space callback: when sock_writeable() holds,
              * wakes pollers waiting for writable events and sends the
              * SOCK_WAKE_SPACE / POLL_OUT async notification. Does nothing
              * otherwise.
              */
3340         struct socket_wq *wq;
3341
3342         rcu_read_lock();
3343
3344         /* Do not wake up a writer until he can make "significant"
3345          * progress.  --DaveM
3346          */
3347         if (sock_writeable(sk)) {
3348                 wq = rcu_dereference(sk->sk_wq);
3349                 if (skwq_has_sleeper(wq))
3350                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3351                                                 EPOLLWRNORM | EPOLLWRBAND);
3352
3353                 /* Should agree with poll, otherwise some programs break */
3354                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3355         }
3356
3357         rcu_read_unlock();
3358 }
3359
3360 /* An optimised version of sock_def_write_space(), should only be called
3361  * for SOCK_RCU_FREE sockets under RCU read section and after putting
3362  * ->sk_wmem_alloc.
      *
      * Differences from sock_def_write_space() visible here: it does not take
      * rcu_read_lock() itself, and it checks for sleepers with
      * smp_mb__after_atomic() + waitqueue_active() instead of
      * skwq_has_sleeper().
3363  */
3364 static void sock_def_write_space_wfree(struct sock *sk)
3365 {
3366         /* Do not wake up a writer until he can make "significant"
3367          * progress.  --DaveM
3368          */
3369         if (sock_writeable(sk)) {
3370                 struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3371
3372                 /* rely on refcount_sub from sock_wfree() */
3373                 smp_mb__after_atomic();
3374                 if (wq && waitqueue_active(&wq->wait))
3375                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3376                                                 EPOLLWRNORM | EPOLLWRBAND);
3377
3378                 /* Should agree with poll, otherwise some programs break */
3379                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3380         }
3381 }
3382
3383 static void sock_def_destruct(struct sock *sk)
3384 {
             /* Default sk_destruct callback: intentionally empty. */
3385 }
3386
3387 void sk_send_sigurg(struct sock *sk)
3388 {
             /* Needs both a socket and a file behind it. The SOCK_WAKE_URG /
              * POLL_PRI async notification is sent only when send_sigurg()
              * returns non-zero.
              */
3389         if (sk->sk_socket && sk->sk_socket->file)
3390                 if (send_sigurg(&sk->sk_socket->file->f_owner))
3391                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3392 }
3393 EXPORT_SYMBOL(sk_send_sigurg);
3394
3395 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3396                     unsigned long expires)
3397 {
             /* (Re)arm @timer for @expires. A socket reference is taken only
              * when mod_timer() returns 0, so one armed timer corresponds to
              * one reference. NOTE(review): mod_timer() returning 0 is taken
              * to mean the timer was not already pending - confirm against
              * the timer API.
              */
3398         if (!mod_timer(timer, expires))
3399                 sock_hold(sk);
3400 }
3401 EXPORT_SYMBOL(sk_reset_timer);
3402
3403 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3404 {
             /* Cancel @timer; when del_timer() returns non-zero, drop a
              * socket reference with __sock_put(), mirroring the sock_hold()
              * in sk_reset_timer().
              */
3405         if (del_timer(timer))
3406                 __sock_put(sk);
3407 }
3408 EXPORT_SYMBOL(sk_stop_timer);
3409
3410 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3411 {
             /* Same as sk_stop_timer() but cancels with del_timer_sync(). */
3412         if (del_timer_sync(timer))
3413                 __sock_put(sk);
3414 }
3415 EXPORT_SYMBOL(sk_stop_timer_sync);
3416
3417 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3418 {
             /* Initialise the generic part of @sk: buffer sizes from the
              * rmem/wmem default sysctls, default callbacks, timeouts,
              * pacing and bookkeeping fields. @sock may be NULL; when given,
              * @sk and @sock are linked to each other. @uid is stored in
              * sk->sk_uid. sk_refcnt is set to 1 near the end, after a write
              * barrier.
              */
3419         sk_init_common(sk);
3420         sk->sk_send_head        =       NULL;
3421
3422         timer_setup(&sk->sk_timer, NULL, 0);
3423
3424         sk->sk_allocation       =       GFP_KERNEL;
3425         sk->sk_rcvbuf           =       READ_ONCE(sysctl_rmem_default);
3426         sk->sk_sndbuf           =       READ_ONCE(sysctl_wmem_default);
3427         sk->sk_state            =       TCP_CLOSE;
3428         sk->sk_use_task_frag    =       true;
3429         sk_set_socket(sk, sock);
3430
3431         sock_set_flag(sk, SOCK_ZAPPED);
3432
             /* With a socket: inherit its type, share its wait queue and
              * point the socket back at this sock. Without one there is no
              * wait queue.
              */
3433         if (sock) {
3434                 sk->sk_type     =       sock->type;
3435                 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3436                 sock->sk        =       sk;
3437         } else {
3438                 RCU_INIT_POINTER(sk->sk_wq, NULL);
3439         }
3440         sk->sk_uid      =       uid;
3441
             /* Kernel and non-kernel sockets get separate lockdep classes
              * for sk_callback_lock, both indexed by address family.
              */
3442         rwlock_init(&sk->sk_callback_lock);
3443         if (sk->sk_kern_sock)
3444                 lockdep_set_class_and_name(
3445                         &sk->sk_callback_lock,
3446                         af_kern_callback_keys + sk->sk_family,
3447                         af_family_kern_clock_key_strings[sk->sk_family]);
3448         else
3449                 lockdep_set_class_and_name(
3450                         &sk->sk_callback_lock,
3451                         af_callback_keys + sk->sk_family,
3452                         af_family_clock_key_strings[sk->sk_family]);
3453
             /* Default callbacks. */
3454         sk->sk_state_change     =       sock_def_wakeup;
3455         sk->sk_data_ready       =       sock_def_readable;
3456         sk->sk_write_space      =       sock_def_write_space;
3457         sk->sk_error_report     =       sock_def_error_report;
3458         sk->sk_destruct         =       sock_def_destruct;
3459
3460         sk->sk_frag.page        =       NULL;
3461         sk->sk_frag.offset      =       0;
3462         sk->sk_peek_off         =       -1;
3463
3464         sk->sk_peer_pid         =       NULL;
3465         sk->sk_peer_cred        =       NULL;
3466         spin_lock_init(&sk->sk_peer_lock);
3467
3468         sk->sk_write_pending    =       0;
3469         sk->sk_rcvlowat         =       1;
3470         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
3471         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
3472
3473         sk->sk_stamp = SK_DEFAULT_STAMP;
3474 #if BITS_PER_LONG==32
3475         seqlock_init(&sk->sk_stamp_seq);
3476 #endif
3477         atomic_set(&sk->sk_zckey, 0);
3478
3479 #ifdef CONFIG_NET_RX_BUSY_POLL
3480         sk->sk_napi_id          =       0;
3481         sk->sk_ll_usec          =       READ_ONCE(sysctl_net_busy_read);
3482 #endif
3483
             /* Pacing rates start at the maximum unsigned long value. */
3484         sk->sk_max_pacing_rate = ~0UL;
3485         sk->sk_pacing_rate = ~0UL;
3486         WRITE_ONCE(sk->sk_pacing_shift, 10);
3487         sk->sk_incoming_cpu = -1;
3488
3489         sk_rx_queue_clear(sk);
3490         /*
3491          * Before updating sk_refcnt, we must commit prior changes to memory
3492          * (Documentation/RCU/rculist_nulls.rst for details)
3493          */
3494         smp_wmb();
3495         refcount_set(&sk->sk_refcnt, 1);
3496         atomic_set(&sk->sk_drops, 0);
3497 }
3498 EXPORT_SYMBOL(sock_init_data_uid);
3499
3500 void sock_init_data(struct socket *sock, struct sock *sk)
3501 {
3502         kuid_t uid = sock ?
3503                 SOCK_INODE(sock)->i_uid :
3504                 make_kuid(sock_net(sk)->user_ns, 0);
3505
3506         sock_init_data_uid(sock, sk, uid);
3507 }
3508 EXPORT_SYMBOL(sock_init_data);
3509
3510 void lock_sock_nested(struct sock *sk, int subclass)
3511 {
3512         /* The sk_lock has mutex_lock() semantics here. */
3513         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3514
3515         might_sleep();
3516         spin_lock_bh(&sk->sk_lock.slock);
3517         if (sock_owned_by_user_nocheck(sk))
3518                 __lock_sock(sk);
3519         sk->sk_lock.owned = 1;
3520         spin_unlock_bh(&sk->sk_lock.slock);
3521 }
3522 EXPORT_SYMBOL(lock_sock_nested);
3523
3524 void release_sock(struct sock *sk)
3525 {
3526         spin_lock_bh(&sk->sk_lock.slock);
3527         if (sk->sk_backlog.tail)
3528                 __release_sock(sk);
3529
3530         /* Warning : release_cb() might need to release sk ownership,
3531          * ie call sock_release_ownership(sk) before us.
3532          */
3533         if (sk->sk_prot->release_cb)
3534                 sk->sk_prot->release_cb(sk);
3535
3536         sock_release_ownership(sk);
3537         if (waitqueue_active(&sk->sk_lock.wq))
3538                 wake_up(&sk->sk_lock.wq);
3539         spin_unlock_bh(&sk->sk_lock.slock);
3540 }
3541 EXPORT_SYMBOL(release_sock);
3542
3543 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3544 {
3545         might_sleep();
3546         spin_lock_bh(&sk->sk_lock.slock);
3547
3548         if (!sock_owned_by_user_nocheck(sk)) {
3549                 /*
3550                  * Fast path return with bottom halves disabled and
3551                  * sock::sk_lock.slock held.
3552                  *
3553                  * The 'mutex' is not contended and holding
3554                  * sock::sk_lock.slock prevents all other lockers to
3555                  * proceed so the corresponding unlock_sock_fast() can
3556                  * avoid the slow path of release_sock() completely and
3557                  * just release slock.
3558                  *
3559                  * From a semantical POV this is equivalent to 'acquiring'
3560                  * the 'mutex', hence the corresponding lockdep
3561                  * mutex_release() has to happen in the fast path of
3562                  * unlock_sock_fast().
3563                  */
3564                 return false;
3565         }
3566
3567         __lock_sock(sk);
3568         sk->sk_lock.owned = 1;
3569         __acquire(&sk->sk_lock.slock);
3570         spin_unlock_bh(&sk->sk_lock.slock);
3571         return true;
3572 }
3573 EXPORT_SYMBOL(__lock_sock_fast);
3574
3575 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3576                    bool timeval, bool time32)
3577 {
3578         struct sock *sk = sock->sk;
3579         struct timespec64 ts;
3580
3581         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3582         ts = ktime_to_timespec64(sock_read_timestamp(sk));
3583         if (ts.tv_sec == -1)
3584                 return -ENOENT;
3585         if (ts.tv_sec == 0) {
3586                 ktime_t kt = ktime_get_real();
3587                 sock_write_timestamp(sk, kt);
3588                 ts = ktime_to_timespec64(kt);
3589         }
3590
3591         if (timeval)
3592                 ts.tv_nsec /= 1000;
3593
3594 #ifdef CONFIG_COMPAT_32BIT_TIME
3595         if (time32)
3596                 return put_old_timespec32(&ts, userstamp);
3597 #endif
3598 #ifdef CONFIG_SPARC64
3599         /* beware of padding in sparc64 timeval */
3600         if (timeval && !in_compat_syscall()) {
3601                 struct __kernel_old_timeval __user tv = {
3602                         .tv_sec = ts.tv_sec,
3603                         .tv_usec = ts.tv_nsec,
3604                 };
3605                 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3606                         return -EFAULT;
3607                 return 0;
3608         }
3609 #endif
3610         return put_timespec64(&ts, userstamp);
3611 }
3612 EXPORT_SYMBOL(sock_gettstamp);
3613
3614 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3615 {
3616         if (!sock_flag(sk, flag)) {
3617                 unsigned long previous_flags = sk->sk_flags;
3618
3619                 sock_set_flag(sk, flag);
3620                 /*
3621                  * we just set one of the two flags which require net
3622                  * time stamping, but time stamping might have been on
3623                  * already because of the other one
3624                  */
3625                 if (sock_needs_netstamp(sk) &&
3626                     !(previous_flags & SK_FLAGS_TIMESTAMP))
3627                         net_enable_timestamp();
3628         }
3629 }
3630
3631 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3632                        int level, int type)
3633 {
3634         struct sock_exterr_skb *serr;
3635         struct sk_buff *skb;
3636         int copied, err;
3637
3638         err = -EAGAIN;
3639         skb = sock_dequeue_err_skb(sk);
3640         if (skb == NULL)
3641                 goto out;
3642
3643         copied = skb->len;
3644         if (copied > len) {
3645                 msg->msg_flags |= MSG_TRUNC;
3646                 copied = len;
3647         }
3648         err = skb_copy_datagram_msg(skb, 0, msg, copied);
3649         if (err)
3650                 goto out_free_skb;
3651
3652         sock_recv_timestamp(msg, sk, skb);
3653
3654         serr = SKB_EXT_ERR(skb);
3655         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3656
3657         msg->msg_flags |= MSG_ERRQUEUE;
3658         err = copied;
3659
3660 out_free_skb:
3661         kfree_skb(skb);
3662 out:
3663         return err;
3664 }
3665 EXPORT_SYMBOL(sock_recv_errqueue);
3666
3667 /*
3668  *      Get a socket option on an socket.
3669  *
3670  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3671  *      asynchronous errors should be reported by getsockopt. We assume
3672  *      this means if you specify SO_ERROR (otherwise whats the point of it).
3673  */
3674 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3675                            char __user *optval, int __user *optlen)
3676 {
3677         struct sock *sk = sock->sk;
3678
3679         /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3680         return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3681 }
3682 EXPORT_SYMBOL(sock_common_getsockopt);
3683
3684 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3685                         int flags)
3686 {
3687         struct sock *sk = sock->sk;
3688         int addr_len = 0;
3689         int err;
3690
3691         err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3692         if (err >= 0)
3693                 msg->msg_namelen = addr_len;
3694         return err;
3695 }
3696 EXPORT_SYMBOL(sock_common_recvmsg);
3697
3698 /*
3699  *      Set socket options on an inet socket.
3700  */
3701 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3702                            sockptr_t optval, unsigned int optlen)
3703 {
3704         struct sock *sk = sock->sk;
3705
3706         /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3707         return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3708 }
3709 EXPORT_SYMBOL(sock_common_setsockopt);
3710
3711 void sk_common_release(struct sock *sk)
3712 {
3713         if (sk->sk_prot->destroy)
3714                 sk->sk_prot->destroy(sk);
3715
3716         /*
3717          * Observation: when sk_common_release is called, processes have
3718          * no access to socket. But net still has.
3719          * Step one, detach it from networking:
3720          *
3721          * A. Remove from hash tables.
3722          */
3723
3724         sk->sk_prot->unhash(sk);
3725
3726         /*
3727          * In this point socket cannot receive new packets, but it is possible
3728          * that some packets are in flight because some CPU runs receiver and
3729          * did hash table lookup before we unhashed socket. They will achieve
3730          * receive queue and will be purged by socket destructor.
3731          *
3732          * Also we still have packets pending on receive queue and probably,
3733          * our own packets waiting in device queues. sock_destroy will drain
3734          * receive queue, but transmitted packets will delay socket destruction
3735          * until the last reference will be released.
3736          */
3737
3738         sock_orphan(sk);
3739
3740         xfrm_sk_free_policy(sk);
3741
3742         sock_put(sk);
3743 }
3744 EXPORT_SYMBOL(sk_common_release);
3745
3746 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3747 {
3748         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3749
3750         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3751         mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3752         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3753         mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3754         mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
3755         mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3756         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3757         mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3758         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3759 }
3760
3761 #ifdef CONFIG_PROC_FS
/*
 * Bitmap of PROTO_INUSE_NR bits tracking which inuse_idx slots are taken:
 * a bit is set by assign_proto_idx() and cleared by release_proto_idx().
 */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3763
3764 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3765 {
3766         int cpu, idx = prot->inuse_idx;
3767         int res = 0;
3768
3769         for_each_possible_cpu(cpu)
3770                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3771
3772         return res >= 0 ? res : 0;
3773 }
3774 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3775
3776 int sock_inuse_get(struct net *net)
3777 {
3778         int cpu, res = 0;
3779
3780         for_each_possible_cpu(cpu)
3781                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3782
3783         return res;
3784 }
3785
3786 EXPORT_SYMBOL_GPL(sock_inuse_get);
3787
3788 static int __net_init sock_inuse_init_net(struct net *net)
3789 {
3790         net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3791         if (net->core.prot_inuse == NULL)
3792                 return -ENOMEM;
3793         return 0;
3794 }
3795
3796 static void __net_exit sock_inuse_exit_net(struct net *net)
3797 {
3798         free_percpu(net->core.prot_inuse);
3799 }
3800
3801 static struct pernet_operations net_inuse_ops = {
3802         .init = sock_inuse_init_net,
3803         .exit = sock_inuse_exit_net,
3804 };
3805
3806 static __init int net_inuse_init(void)
3807 {
3808         if (register_pernet_subsys(&net_inuse_ops))
3809                 panic("Cannot initialize net inuse counters");
3810
3811         return 0;
3812 }
3813
3814 core_initcall(net_inuse_init);
3815
3816 static int assign_proto_idx(struct proto *prot)
3817 {
3818         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3819
3820         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3821                 pr_err("PROTO_INUSE_NR exhausted\n");
3822                 return -ENOSPC;
3823         }
3824
3825         set_bit(prot->inuse_idx, proto_inuse_idx);
3826         return 0;
3827 }
3828
3829 static void release_proto_idx(struct proto *prot)
3830 {
3831         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3832                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3833 }
3834 #else
/* Stub used when CONFIG_PROC_FS is not set: nothing to assign. */
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}
3839
/* Stub used when CONFIG_PROC_FS is not set: nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3843
3844 #endif
3845
3846 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3847 {
3848         if (!twsk_prot)
3849                 return;
3850         kfree(twsk_prot->twsk_slab_name);
3851         twsk_prot->twsk_slab_name = NULL;
3852         kmem_cache_destroy(twsk_prot->twsk_slab);
3853         twsk_prot->twsk_slab = NULL;
3854 }
3855
3856 static int tw_prot_init(const struct proto *prot)
3857 {
3858         struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3859
3860         if (!twsk_prot)
3861                 return 0;
3862
3863         twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3864                                               prot->name);
3865         if (!twsk_prot->twsk_slab_name)
3866                 return -ENOMEM;
3867
3868         twsk_prot->twsk_slab =
3869                 kmem_cache_create(twsk_prot->twsk_slab_name,
3870                                   twsk_prot->twsk_obj_size, 0,
3871                                   SLAB_ACCOUNT | prot->slab_flags,
3872                                   NULL);
3873         if (!twsk_prot->twsk_slab) {
3874                 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3875                         prot->name);
3876                 return -ENOMEM;
3877         }
3878
3879         return 0;
3880 }
3881
3882 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3883 {
3884         if (!rsk_prot)
3885                 return;
3886         kfree(rsk_prot->slab_name);
3887         rsk_prot->slab_name = NULL;
3888         kmem_cache_destroy(rsk_prot->slab);
3889         rsk_prot->slab = NULL;
3890 }
3891
3892 static int req_prot_init(const struct proto *prot)
3893 {
3894         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3895
3896         if (!rsk_prot)
3897                 return 0;
3898
3899         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3900                                         prot->name);
3901         if (!rsk_prot->slab_name)
3902                 return -ENOMEM;
3903
3904         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3905                                            rsk_prot->obj_size, 0,
3906                                            SLAB_ACCOUNT | prot->slab_flags,
3907                                            NULL);
3908
3909         if (!rsk_prot->slab) {
3910                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3911                         prot->name);
3912                 return -ENOMEM;
3913         }
3914         return 0;
3915 }
3916
3917 int proto_register(struct proto *prot, int alloc_slab)
3918 {
3919         int ret = -ENOBUFS;
3920
3921         if (prot->memory_allocated && !prot->sysctl_mem) {
3922                 pr_err("%s: missing sysctl_mem\n", prot->name);
3923                 return -EINVAL;
3924         }
3925         if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3926                 pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3927                 return -EINVAL;
3928         }
3929         if (alloc_slab) {
3930                 prot->slab = kmem_cache_create_usercopy(prot->name,
3931                                         prot->obj_size, 0,
3932                                         SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3933                                         prot->slab_flags,
3934                                         prot->useroffset, prot->usersize,
3935                                         NULL);
3936
3937                 if (prot->slab == NULL) {
3938                         pr_crit("%s: Can't create sock SLAB cache!\n",
3939                                 prot->name);
3940                         goto out;
3941                 }
3942
3943                 if (req_prot_init(prot))
3944                         goto out_free_request_sock_slab;
3945
3946                 if (tw_prot_init(prot))
3947                         goto out_free_timewait_sock_slab;
3948         }
3949
3950         mutex_lock(&proto_list_mutex);
3951         ret = assign_proto_idx(prot);
3952         if (ret) {
3953                 mutex_unlock(&proto_list_mutex);
3954                 goto out_free_timewait_sock_slab;
3955         }
3956         list_add(&prot->node, &proto_list);
3957         mutex_unlock(&proto_list_mutex);
3958         return ret;
3959
3960 out_free_timewait_sock_slab:
3961         if (alloc_slab)
3962                 tw_prot_cleanup(prot->twsk_prot);
3963 out_free_request_sock_slab:
3964         if (alloc_slab) {
3965                 req_prot_cleanup(prot->rsk_prot);
3966
3967                 kmem_cache_destroy(prot->slab);
3968                 prot->slab = NULL;
3969         }
3970 out:
3971         return ret;
3972 }
3973 EXPORT_SYMBOL(proto_register);
3974
3975 void proto_unregister(struct proto *prot)
3976 {
3977         mutex_lock(&proto_list_mutex);
3978         release_proto_idx(prot);
3979         list_del(&prot->node);
3980         mutex_unlock(&proto_list_mutex);
3981
3982         kmem_cache_destroy(prot->slab);
3983         prot->slab = NULL;
3984
3985         req_prot_cleanup(prot->rsk_prot);
3986         tw_prot_cleanup(prot->twsk_prot);
3987 }
3988 EXPORT_SYMBOL(proto_unregister);
3989
3990 int sock_load_diag_module(int family, int protocol)
3991 {
3992         if (!protocol) {
3993                 if (!sock_is_registered(family))
3994                         return -ENOENT;
3995
3996                 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3997                                       NETLINK_SOCK_DIAG, family);
3998         }
3999
4000 #ifdef CONFIG_INET
4001         if (family == AF_INET &&
4002             protocol != IPPROTO_RAW &&
4003             protocol < MAX_INET_PROTOS &&
4004             !rcu_access_pointer(inet_protos[protocol]))
4005                 return -ENOENT;
4006 #endif
4007
4008         return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4009                               NETLINK_SOCK_DIAG, family, protocol);
4010 }
4011 EXPORT_SYMBOL(sock_load_diag_module);
4012
4013 #ifdef CONFIG_PROC_FS
4014 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4015         __acquires(proto_list_mutex)
4016 {
4017         mutex_lock(&proto_list_mutex);
4018         return seq_list_start_head(&proto_list, *pos);
4019 }
4020
4021 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4022 {
4023         return seq_list_next(v, &proto_list, pos);
4024 }
4025
4026 static void proto_seq_stop(struct seq_file *seq, void *v)
4027         __releases(proto_list_mutex)
4028 {
4029         mutex_unlock(&proto_list_mutex);
4030 }
4031
/* Map a method pointer to 'y' (non-NULL) or 'n' (NULL). */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';

	return 'n';
}
4036 static long sock_prot_memory_allocated(struct proto *proto)
4037 {
4038         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4039 }
4040
4041 static const char *sock_prot_memory_pressure(struct proto *proto)
4042 {
4043         return proto->memory_pressure != NULL ?
4044         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4045 }
4046
4047 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4048 {
4049
4050         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4051                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4052                    proto->name,
4053                    proto->obj_size,
4054                    sock_prot_inuse_get(seq_file_net(seq), proto),
4055                    sock_prot_memory_allocated(proto),
4056                    sock_prot_memory_pressure(proto),
4057                    proto->max_header,
4058                    proto->slab == NULL ? "no" : "yes",
4059                    module_name(proto->owner),
4060                    proto_method_implemented(proto->close),
4061                    proto_method_implemented(proto->connect),
4062                    proto_method_implemented(proto->disconnect),
4063                    proto_method_implemented(proto->accept),
4064                    proto_method_implemented(proto->ioctl),
4065                    proto_method_implemented(proto->init),
4066                    proto_method_implemented(proto->destroy),
4067                    proto_method_implemented(proto->shutdown),
4068                    proto_method_implemented(proto->setsockopt),
4069                    proto_method_implemented(proto->getsockopt),
4070                    proto_method_implemented(proto->sendmsg),
4071                    proto_method_implemented(proto->recvmsg),
4072                    proto_method_implemented(proto->bind),
4073                    proto_method_implemented(proto->backlog_rcv),
4074                    proto_method_implemented(proto->hash),
4075                    proto_method_implemented(proto->unhash),
4076                    proto_method_implemented(proto->get_port),
4077                    proto_method_implemented(proto->enter_memory_pressure));
4078 }
4079
4080 static int proto_seq_show(struct seq_file *seq, void *v)
4081 {
4082         if (v == &proto_list)
4083                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4084                            "protocol",
4085                            "size",
4086                            "sockets",
4087                            "memory",
4088                            "press",
4089                            "maxhdr",
4090                            "slab",
4091                            "module",
4092                            "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4093         else
4094                 proto_seq_printf(seq, list_entry(v, struct proto, node));
4095         return 0;
4096 }
4097
4098 static const struct seq_operations proto_seq_ops = {
4099         .start  = proto_seq_start,
4100         .next   = proto_seq_next,
4101         .stop   = proto_seq_stop,
4102         .show   = proto_seq_show,
4103 };
4104
4105 static __net_init int proto_init_net(struct net *net)
4106 {
4107         if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4108                         sizeof(struct seq_net_private)))
4109                 return -ENOMEM;
4110
4111         return 0;
4112 }
4113
4114 static __net_exit void proto_exit_net(struct net *net)
4115 {
4116         remove_proc_entry("protocols", net->proc_net);
4117 }
4118
4119
4120 static __net_initdata struct pernet_operations proto_net_ops = {
4121         .init = proto_init_net,
4122         .exit = proto_exit_net,
4123 };
4124
4125 static int __init proto_init(void)
4126 {
4127         return register_pernet_subsys(&proto_net_ops);
4128 }
4129
4130 subsys_initcall(proto_init);
4131
4132 #endif /* PROC_FS */
4133
4134 #ifdef CONFIG_NET_RX_BUSY_POLL
4135 bool sk_busy_loop_end(void *p, unsigned long start_time)
4136 {
4137         struct sock *sk = p;
4138
4139         return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
4140                sk_busy_loop_timeout(sk, start_time);
4141 }
4142 EXPORT_SYMBOL(sk_busy_loop_end);
4143 #endif /* CONFIG_NET_RX_BUSY_POLL */
4144
4145 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4146 {
4147         if (!sk->sk_prot->bind_add)
4148                 return -EOPNOTSUPP;
4149         return sk->sk_prot->bind_add(sk, addr, addr_len);
4150 }
4151 EXPORT_SYMBOL(sock_bind_add);
4152
4153 /* Copy 'size' bytes from userspace and return `size` back to userspace */
4154 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4155                      void __user *arg, void *karg, size_t size)
4156 {
4157         int ret;
4158
4159         if (copy_from_user(karg, arg, size))
4160                 return -EFAULT;
4161
4162         ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4163         if (ret)
4164                 return ret;
4165
4166         if (copy_to_user(arg, karg, size))
4167                 return -EFAULT;
4168
4169         return 0;
4170 }
4171 EXPORT_SYMBOL(sock_ioctl_inout);
4172
4173 /* This is the most common ioctl prep function, where the result (4 bytes) is
4174  * copied back to userspace if the ioctl() returns successfully. No input is
4175  * copied from userspace as input argument.
4176  */
4177 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4178 {
4179         int ret, karg = 0;
4180
4181         ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4182         if (ret)
4183                 return ret;
4184
4185         return put_user(karg, (int __user *)arg);
4186 }
4187
4188 /* A wrapper around sock ioctls, which copies the data from userspace
4189  * (depending on the protocol/ioctl), and copies back the result to userspace.
4190  * The main motivation for this function is to pass kernel memory to the
4191  * protocol ioctl callbacks, instead of userspace memory.
4192  */
4193 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4194 {
4195         int rc = 1;
4196
4197         if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4198                 rc = ipmr_sk_ioctl(sk, cmd, arg);
4199         else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4200                 rc = ip6mr_sk_ioctl(sk, cmd, arg);
4201         else if (sk_is_phonet(sk))
4202                 rc = phonet_sk_ioctl(sk, cmd, arg);
4203
4204         /* If ioctl was processed, returns its value */
4205         if (rc <= 0)
4206                 return rc;
4207
4208         /* Otherwise call the default handler */
4209         return sock_ioctl_out(sk, cmd, arg);
4210 }
4211 EXPORT_SYMBOL(sk_ioctl);