net/core/sock.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              Generic socket support routines. Memory allocators, socket lock/release
   8  *              handler for protocols to use and generic option handler.
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  */
  85
  86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88 #include <asm/unaligned.h>
  89 #include <linux/capability.h>
  90 #include <linux/errno.h>
  91 #include <linux/errqueue.h>
  92 #include <linux/types.h>
  93 #include <linux/socket.h>
  94 #include <linux/in.h>
  95 #include <linux/kernel.h>
  96 #include <linux/module.h>
  97 #include <linux/proc_fs.h>
  98 #include <linux/seq_file.h>
  99 #include <linux/sched.h>
 100 #include <linux/sched/mm.h>
 101 #include <linux/timer.h>
 102 #include <linux/string.h>
 103 #include <linux/sockios.h>
 104 #include <linux/net.h>
 105 #include <linux/mm.h>
 106 #include <linux/slab.h>
 107 #include <linux/interrupt.h>
 108 #include <linux/poll.h>
 109 #include <linux/tcp.h>
 110 #include <linux/init.h>
 111 #include <linux/highmem.h>
 112 #include <linux/user_namespace.h>
 113 #include <linux/static_key.h>
 114 #include <linux/memcontrol.h>
 115 #include <linux/prefetch.h>
 116 #include <linux/compat.h>
 117 #include <linux/mroute.h>
 118 #include <linux/mroute6.h>
 119 #include <linux/icmpv6.h>
 120
 121 #include <linux/uaccess.h>
 122
 123 #include <linux/netdevice.h>
 124 #include <net/protocol.h>
 125 #include <linux/skbuff.h>
 126 #include <net/net_namespace.h>
 127 #include <net/request_sock.h>
 128 #include <net/sock.h>
 129 #include <linux/net_tstamp.h>
 130 #include <net/xfrm.h>
 131 #include <linux/ipsec.h>
 132 #include <net/cls_cgroup.h>
 133 #include <net/netprio_cgroup.h>
 134 #include <linux/sock_diag.h>
 135
 136 #include <linux/filter.h>
 137 #include <net/sock_reuseport.h>
 138 #include <net/bpf_sk_storage.h>
 139
 140 #include <trace/events/sock.h>
 141
 142 #include <net/tcp.h>
 143 #include <net/busy_poll.h>
 144 #include <net/phonet/phonet.h>
 145
 146 #include <linux/ethtool.h>
 147
 148 #include "dev.h"
 149
 /* Forward declarations of write-space callbacks defined later in this file. */
 static void sock_def_write_space(struct sock *sk);
 static void sock_def_write_space_wfree(struct sock *sk);

 /*
  * File-local list head and mutex.
  * NOTE(review): presumably proto_list_mutex guards proto_list -- confirm
  * against the users further down in the file.
  */
 static LIST_HEAD(proto_list);
 static DEFINE_MUTEX(proto_list_mutex);
 155
 156 /**
 157  * sk_ns_capable - General socket capability test
 158  * @sk: Socket to use a capability on or through
 159  * @user_ns: The user namespace of the capability to use
 160  * @cap: The capability to use
 161  *
 162  * Test to see if the opener of the socket had when the socket was
 163  * created and the current process has the capability @cap in the user
 164  * namespace @user_ns.
 165  */
 166 bool sk_ns_capable(const struct sock *sk,
 167                    struct user_namespace *user_ns, int cap)
 168 {
 169         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 170                 ns_capable(user_ns, cap);
 171 }
 172 EXPORT_SYMBOL(sk_ns_capable);
 173
 174 /**
 175  * sk_capable - Socket global capability test
 176  * @sk: Socket to use a capability on or through
 177  * @cap: The global capability to use
 178  *
 179  * Test to see if the opener of the socket had when the socket was
 180  * created and the current process has the capability @cap in all user
 181  * namespaces.
 182  */
 183 bool sk_capable(const struct sock *sk, int cap)
 184 {
 185         return sk_ns_capable(sk, &init_user_ns, cap);
 186 }
 187 EXPORT_SYMBOL(sk_capable);
 188
 189 /**
 190  * sk_net_capable - Network namespace socket capability test
 191  * @sk: Socket to use a capability on or through
 192  * @cap: The capability to use
 193  *
 194  * Test to see if the opener of the socket had when the socket was created
 195  * and the current process has the capability @cap over the network namespace
 196  * the socket is a member of.
 197  */
 198 bool sk_net_capable(const struct sock *sk, int cap)
 199 {
 200         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 201 }
 202 EXPORT_SYMBOL(sk_net_capable);
 203
 204 /*
 205  * Each address family might have different locking rules, so we have
 206  * one slock key per address family and separate keys for internal and
 207  * userspace sockets.
 208  */
 209 static struct lock_class_key af_family_keys[AF_MAX];
 210 static struct lock_class_key af_family_kern_keys[AF_MAX];
 211 static struct lock_class_key af_family_slock_keys[AF_MAX];
 212 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 213
 /*
  * Human-readable names for the lock validator.  The strings are glued
  * together by the preprocessor at build time, so nothing has to be
  * formatted when a socket lock is initialised at run time.
  *
  * _sock_locks(x) expands to a comma-separated list of string literals,
  * each one being the prefix @x followed by an address-family name.  The
  * order of the entries is significant: it must stay exactly as listed.
  */
 #define _sock_locks(x)		\
 	x "AF_UNSPEC",		\
 	x "AF_UNIX",		\
 	x "AF_INET",		\
 	x "AF_AX25",		\
 	x "AF_IPX",		\
 	x "AF_APPLETALK",	\
 	x "AF_NETROM",		\
 	x "AF_BRIDGE",		\
 	x "AF_ATMPVC",		\
 	x "AF_X25",		\
 	x "AF_INET6",		\
 	x "AF_ROSE",		\
 	x "AF_DECnet",		\
 	x "AF_NETBEUI",		\
 	x "AF_SECURITY",	\
 	x "AF_KEY",		\
 	x "AF_NETLINK",		\
 	x "AF_PACKET",		\
 	x "AF_ASH",		\
 	x "AF_ECONET",		\
 	x "AF_ATMSVC",		\
 	x "AF_RDS",		\
 	x "AF_SNA",		\
 	x "AF_IRDA",		\
 	x "AF_PPPOX",		\
 	x "AF_WANPIPE",		\
 	x "AF_LLC",		\
 	x "27",			\
 	x "28",			\
 	x "AF_CAN",		\
 	x "AF_TIPC",		\
 	x "AF_BLUETOOTH",	\
 	x "IUCV",		\
 	x "AF_RXRPC",		\
 	x "AF_ISDN",		\
 	x "AF_PHONET",		\
 	x "AF_IEEE802154",	\
 	x "AF_CAIF",		\
 	x "AF_ALG",		\
 	x "AF_NFC",		\
 	x "AF_VSOCK",		\
 	x "AF_KCM",		\
 	x "AF_QIPCRTR",		\
 	x "AF_SMC",		\
 	x "AF_XDP",		\
 	x "AF_MCTP",		\
 	x "AF_MAX"
 238
 239 static const char *const af_family_key_strings[AF_MAX+1] = {
 240         _sock_locks("sk_lock-")
 241 };
 242 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 243         _sock_locks("slock-")
 244 };
 245 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 246         _sock_locks("clock-")
 247 };
 248
 249 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 250         _sock_locks("k-sk_lock-")
 251 };
 252 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 253         _sock_locks("k-slock-")
 254 };
 255 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 256         _sock_locks("k-clock-")
 257 };
 258 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 259         _sock_locks("rlock-")
 260 };
 261 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 262         _sock_locks("wlock-")
 263 };
 264 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 265         _sock_locks("elock-")
 266 };
 267
 268 /*
 269  * sk_callback_lock and sk queues locking rules are per-address-family,
 270  * so split the lock classes by using a per-AF key:
 271  */
 272 static struct lock_class_key af_callback_keys[AF_MAX];
 273 static struct lock_class_key af_rlock_keys[AF_MAX];
 274 static struct lock_class_key af_wlock_keys[AF_MAX];
 275 static struct lock_class_key af_elock_keys[AF_MAX];
 276 static struct lock_class_key af_kern_callback_keys[AF_MAX];
 277
 278 /* Run time adjustable parameters. */
 279 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 280 EXPORT_SYMBOL(sysctl_wmem_max);
 281 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 282 EXPORT_SYMBOL(sysctl_rmem_max);
 283 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 284 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 285
 286 /* Maximal space eaten by iovec or ancillary data plus some space */
 287 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 288 EXPORT_SYMBOL(sysctl_optmem_max);
 289
 290 int sysctl_tstamp_allow_data __read_mostly = 1;
 291
 292 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 293 EXPORT_SYMBOL_GPL(memalloc_socks_key);
 294
 295 /**
 296  * sk_set_memalloc - sets %SOCK_MEMALLOC
 297  * @sk: socket to set it on
 298  *
 299  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 300  * It's the responsibility of the admin to adjust min_free_kbytes
 301  * to meet the requirements
 302  */
 303 void sk_set_memalloc(struct sock *sk)
 304 {
 305         sock_set_flag(sk, SOCK_MEMALLOC);
 306         sk->sk_allocation |= __GFP_MEMALLOC;
 307         static_branch_inc(&memalloc_socks_key);
 308 }
 309 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 310
 311 void sk_clear_memalloc(struct sock *sk)
 312 {
 313         sock_reset_flag(sk, SOCK_MEMALLOC);
 314         sk->sk_allocation &= ~__GFP_MEMALLOC;
 315         static_branch_dec(&memalloc_socks_key);
 316
 317         /*
 318          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 319          * progress of swapping. SOCK_MEMALLOC may be cleared while
 320          * it has rmem allocations due to the last swapfile being deactivated
 321          * but there is a risk that the socket is unusable due to exceeding
 322          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 323          */
 324         sk_mem_reclaim(sk);
 325 }
 326 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 327
 328 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 329 {
 330         int ret;
 331         unsigned int noreclaim_flag;
 332
 333         /* these should have been dropped before queueing */
 334         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 335
 336         noreclaim_flag = memalloc_noreclaim_save();
 337         ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
 338                                  tcp_v6_do_rcv,
 339                                  tcp_v4_do_rcv,
 340                                  sk, skb);
 341         memalloc_noreclaim_restore(noreclaim_flag);
 342
 343         return ret;
 344 }
 345 EXPORT_SYMBOL(__sk_backlog_rcv);
 346
 347 void sk_error_report(struct sock *sk)
 348 {
 349         sk->sk_error_report(sk);
 350
 351         switch (sk->sk_family) {
 352         case AF_INET:
 353                 fallthrough;
 354         case AF_INET6:
 355                 trace_inet_sk_error_report(sk);
 356                 break;
 357         default:
 358                 break;
 359         }
 360 }
 361 EXPORT_SYMBOL(sk_error_report);
 362
 363 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 364 {
 365         struct __kernel_sock_timeval tv;
 366
 367         if (timeo == MAX_SCHEDULE_TIMEOUT) {
 368                 tv.tv_sec = 0;
 369                 tv.tv_usec = 0;
 370         } else {
 371                 tv.tv_sec = timeo / HZ;
 372                 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 373         }
 374
 375         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 376                 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 377                 *(struct old_timeval32 *)optval = tv32;
 378                 return sizeof(tv32);
 379         }
 380
 381         if (old_timeval) {
 382                 struct __kernel_old_timeval old_tv;
 383                 old_tv.tv_sec = tv.tv_sec;
 384                 old_tv.tv_usec = tv.tv_usec;
 385                 *(struct __kernel_old_timeval *)optval = old_tv;
 386                 return sizeof(old_tv);
 387         }
 388
 389         *(struct __kernel_sock_timeval *)optval = tv;
 390         return sizeof(tv);
 391 }
 392 EXPORT_SYMBOL(sock_get_timeout);
 393
 394 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
 395                            sockptr_t optval, int optlen, bool old_timeval)
 396 {
 397         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 398                 struct old_timeval32 tv32;
 399
 400                 if (optlen < sizeof(tv32))
 401                         return -EINVAL;
 402
 403                 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
 404                         return -EFAULT;
 405                 tv->tv_sec = tv32.tv_sec;
 406                 tv->tv_usec = tv32.tv_usec;
 407         } else if (old_timeval) {
 408                 struct __kernel_old_timeval old_tv;
 409
 410                 if (optlen < sizeof(old_tv))
 411                         return -EINVAL;
 412                 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
 413                         return -EFAULT;
 414                 tv->tv_sec = old_tv.tv_sec;
 415                 tv->tv_usec = old_tv.tv_usec;
 416         } else {
 417                 if (optlen < sizeof(*tv))
 418                         return -EINVAL;
 419                 if (copy_from_sockptr(tv, optval, sizeof(*tv)))
 420                         return -EFAULT;
 421         }
 422
 423         return 0;
 424 }
 425 EXPORT_SYMBOL(sock_copy_user_timeval);
 426
 427 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
 428                             bool old_timeval)
 429 {
 430         struct __kernel_sock_timeval tv;
 431         int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
 432
 433         if (err)
 434                 return err;
 435
 436         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 437                 return -EDOM;
 438
 439         if (tv.tv_sec < 0) {
 440                 static int warned __read_mostly;
 441
 442                 *timeo_p = 0;
 443                 if (warned < 10 && net_ratelimit()) {
 444                         warned++;
 445                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 446                                 __func__, current->comm, task_pid_nr(current));
 447                 }
 448                 return 0;
 449         }
 450         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 451         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 452                 return 0;
 453         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
 454                 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
 455         return 0;
 456 }
 457
 458 static bool sock_needs_netstamp(const struct sock *sk)
 459 {
 460         switch (sk->sk_family) {
 461         case AF_UNSPEC:
 462         case AF_UNIX:
 463                 return false;
 464         default:
 465                 return true;
 466         }
 467 }
 468
 469 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 470 {
 471         if (sk->sk_flags & flags) {
 472                 sk->sk_flags &= ~flags;
 473                 if (sock_needs_netstamp(sk) &&
 474                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 475                         net_disable_timestamp();
 476         }
 477 }
 478
 479
 480 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 481 {
 482         unsigned long flags;
 483         struct sk_buff_head *list = &sk->sk_receive_queue;
 484
 485         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 486                 atomic_inc(&sk->sk_drops);
 487                 trace_sock_rcvqueue_full(sk, skb);
 488                 return -ENOMEM;
 489         }
 490
 491         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 492                 atomic_inc(&sk->sk_drops);
 493                 return -ENOBUFS;
 494         }
 495
 496         skb->dev = NULL;
 497         skb_set_owner_r(skb, sk);
 498
 499         /* we escape from rcu protected region, make sure we dont leak
 500          * a norefcounted dst
 501          */
 502         skb_dst_force(skb);
 503
 504         spin_lock_irqsave(&list->lock, flags);
 505         sock_skb_set_dropcount(sk, skb);
 506         __skb_queue_tail(list, skb);
 507         spin_unlock_irqrestore(&list->lock, flags);
 508
 509         if (!sock_flag(sk, SOCK_DEAD))
 510                 sk->sk_data_ready(sk);
 511         return 0;
 512 }
 513 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 514
 515 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
 516                               enum skb_drop_reason *reason)
 517 {
 518         enum skb_drop_reason drop_reason;
 519         int err;
 520
 521         err = sk_filter(sk, skb);
 522         if (err) {
 523                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
 524                 goto out;
 525         }
 526         err = __sock_queue_rcv_skb(sk, skb);
 527         switch (err) {
 528         case -ENOMEM:
 529                 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
 530                 break;
 531         case -ENOBUFS:
 532                 drop_reason = SKB_DROP_REASON_PROTO_MEM;
 533                 break;
 534         default:
 535                 drop_reason = SKB_NOT_DROPPED_YET;
 536                 break;
 537         }
 538 out:
 539         if (reason)
 540                 *reason = drop_reason;
 541         return err;
 542 }
 543 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
 544
 545 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 546                      const int nested, unsigned int trim_cap, bool refcounted)
 547 {
 548         int rc = NET_RX_SUCCESS;
 549
 550         if (sk_filter_trim_cap(sk, skb, trim_cap))
 551                 goto discard_and_relse;
 552
 553         skb->dev = NULL;
 554
 555         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 556                 atomic_inc(&sk->sk_drops);
 557                 goto discard_and_relse;
 558         }
 559         if (nested)
 560                 bh_lock_sock_nested(sk);
 561         else
 562                 bh_lock_sock(sk);
 563         if (!sock_owned_by_user(sk)) {
 564                 /*
 565                  * trylock + unlock semantics:
 566                  */
 567                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 568
 569                 rc = sk_backlog_rcv(sk, skb);
 570
 571                 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 572         } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
 573                 bh_unlock_sock(sk);
 574                 atomic_inc(&sk->sk_drops);
 575                 goto discard_and_relse;
 576         }
 577
 578         bh_unlock_sock(sk);
 579 out:
 580         if (refcounted)
 581                 sock_put(sk);
 582         return rc;
 583 discard_and_relse:
 584         kfree_skb(skb);
 585         goto out;
 586 }
 587 EXPORT_SYMBOL(__sk_receive_skb);
 588
 589 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
 590                                                           u32));
 591 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 592                                                            u32));
 593 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 594 {
 595         struct dst_entry *dst = __sk_dst_get(sk);
 596
 597         if (dst && dst->obsolete &&
 598             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 599                                dst, cookie) == NULL) {
 600                 sk_tx_queue_clear(sk);
 601                 sk->sk_dst_pending_confirm = 0;
 602                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 603                 dst_release(dst);
 604                 return NULL;
 605         }
 606
 607         return dst;
 608 }
 609 EXPORT_SYMBOL(__sk_dst_check);
 610
 611 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 612 {
 613         struct dst_entry *dst = sk_dst_get(sk);
 614
 615         if (dst && dst->obsolete &&
 616             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 617                                dst, cookie) == NULL) {
 618                 sk_dst_reset(sk);
 619                 dst_release(dst);
 620                 return NULL;
 621         }
 622
 623         return dst;
 624 }
 625 EXPORT_SYMBOL(sk_dst_check);
 626
 627 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
 628 {
 629         int ret = -ENOPROTOOPT;
 630 #ifdef CONFIG_NETDEVICES
 631         struct net *net = sock_net(sk);
 632
 633         /* Sorry... */
 634         ret = -EPERM;
 635         if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
 636                 goto out;
 637
 638         ret = -EINVAL;
 639         if (ifindex < 0)
 640                 goto out;
 641
 642         /* Paired with all READ_ONCE() done locklessly. */
 643         WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
 644
 645         if (sk->sk_prot->rehash)
 646                 sk->sk_prot->rehash(sk);
 647         sk_dst_reset(sk);
 648
 649         ret = 0;
 650
 651 out:
 652 #endif
 653
 654         return ret;
 655 }
 656
 657 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
 658 {
 659         int ret;
 660
 661         if (lock_sk)
 662                 lock_sock(sk);
 663         ret = sock_bindtoindex_locked(sk, ifindex);
 664         if (lock_sk)
 665                 release_sock(sk);
 666
 667         return ret;
 668 }
 669 EXPORT_SYMBOL(sock_bindtoindex);
 670
 671 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
 672 {
 673         int ret = -ENOPROTOOPT;
 674 #ifdef CONFIG_NETDEVICES
 675         struct net *net = sock_net(sk);
 676         char devname[IFNAMSIZ];
 677         int index;
 678
 679         ret = -EINVAL;
 680         if (optlen < 0)
 681                 goto out;
 682
 683         /* Bind this socket to a particular device like "eth0",
 684          * as specified in the passed interface name. If the
 685          * name is "" or the option length is zero the socket
 686          * is not bound.
 687          */
 688         if (optlen > IFNAMSIZ - 1)
 689                 optlen = IFNAMSIZ - 1;
 690         memset(devname, 0, sizeof(devname));
 691
 692         ret = -EFAULT;
 693         if (copy_from_sockptr(devname, optval, optlen))
 694                 goto out;
 695
 696         index = 0;
 697         if (devname[0] != '\0') {
 698                 struct net_device *dev;
 699
 700                 rcu_read_lock();
 701                 dev = dev_get_by_name_rcu(net, devname);
 702                 if (dev)
 703                         index = dev->ifindex;
 704                 rcu_read_unlock();
 705                 ret = -ENODEV;
 706                 if (!dev)
 707                         goto out;
 708         }
 709
 710         sockopt_lock_sock(sk);
 711         ret = sock_bindtoindex_locked(sk, index);
 712         sockopt_release_sock(sk);
 713 out:
 714 #endif
 715
 716         return ret;
 717 }
 718
 719 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
 720                                 sockptr_t optlen, int len)
 721 {
 722         int ret = -ENOPROTOOPT;
 723 #ifdef CONFIG_NETDEVICES
 724         int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
 725         struct net *net = sock_net(sk);
 726         char devname[IFNAMSIZ];
 727
 728         if (bound_dev_if == 0) {
 729                 len = 0;
 730                 goto zero;
 731         }
 732
 733         ret = -EINVAL;
 734         if (len < IFNAMSIZ)
 735                 goto out;
 736
 737         ret = netdev_get_name(net, devname, bound_dev_if);
 738         if (ret)
 739                 goto out;
 740
 741         len = strlen(devname) + 1;
 742
 743         ret = -EFAULT;
 744         if (copy_to_sockptr(optval, devname, len))
 745                 goto out;
 746
 747 zero:
 748         ret = -EFAULT;
 749         if (copy_to_sockptr(optlen, &len, sizeof(int)))
 750                 goto out;
 751
 752         ret = 0;
 753
 754 out:
 755 #endif
 756
 757         return ret;
 758 }
 759
 760 bool sk_mc_loop(struct sock *sk)
 761 {
 762         if (dev_recursion_level())
 763                 return false;
 764         if (!sk)
 765                 return true;
 766         switch (sk->sk_family) {
 767         case AF_INET:
 768                 return inet_sk(sk)->mc_loop;
 769 #if IS_ENABLED(CONFIG_IPV6)
 770         case AF_INET6:
 771                 return inet6_sk(sk)->mc_loop;
 772 #endif
 773         }
 774         WARN_ON_ONCE(1);
 775         return true;
 776 }
 777 EXPORT_SYMBOL(sk_mc_loop);
 778
 779 void sock_set_reuseaddr(struct sock *sk)
 780 {
 781         lock_sock(sk);
 782         sk->sk_reuse = SK_CAN_REUSE;
 783         release_sock(sk);
 784 }
 785 EXPORT_SYMBOL(sock_set_reuseaddr);
 786
 787 void sock_set_reuseport(struct sock *sk)
 788 {
 789         lock_sock(sk);
 790         sk->sk_reuseport = true;
 791         release_sock(sk);
 792 }
 793 EXPORT_SYMBOL(sock_set_reuseport);
 794
 795 void sock_no_linger(struct sock *sk)
 796 {
 797         lock_sock(sk);
 798         sk->sk_lingertime = 0;
 799         sock_set_flag(sk, SOCK_LINGER);
 800         release_sock(sk);
 801 }
 802 EXPORT_SYMBOL(sock_no_linger);
 803
 804 void sock_set_priority(struct sock *sk, u32 priority)
 805 {
 806         lock_sock(sk);
 807         sk->sk_priority = priority;
 808         release_sock(sk);
 809 }
 810 EXPORT_SYMBOL(sock_set_priority);
 811
 812 void sock_set_sndtimeo(struct sock *sk, s64 secs)
 813 {
 814         lock_sock(sk);
 815         if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
 816                 sk->sk_sndtimeo = secs * HZ;
 817         else
 818                 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
 819         release_sock(sk);
 820 }
 821 EXPORT_SYMBOL(sock_set_sndtimeo);
 822
 823 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
 824 {
 825         if (val)  {
 826                 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
 827                 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
 828                 sock_set_flag(sk, SOCK_RCVTSTAMP);
 829                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 830         } else {
 831                 sock_reset_flag(sk, SOCK_RCVTSTAMP);
 832                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 833         }
 834 }
 835
 836 void sock_enable_timestamps(struct sock *sk)
 837 {
 838         lock_sock(sk);
 839         __sock_set_timestamps(sk, true, false, true);
 840         release_sock(sk);
 841 }
 842 EXPORT_SYMBOL(sock_enable_timestamps);
 843
 844 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
 845 {
 846         switch (optname) {
 847         case SO_TIMESTAMP_OLD:
 848                 __sock_set_timestamps(sk, valbool, false, false);
 849                 break;
 850         case SO_TIMESTAMP_NEW:
 851                 __sock_set_timestamps(sk, valbool, true, false);
 852                 break;
 853         case SO_TIMESTAMPNS_OLD:
 854                 __sock_set_timestamps(sk, valbool, false, true);
 855                 break;
 856         case SO_TIMESTAMPNS_NEW:
 857                 __sock_set_timestamps(sk, valbool, true, true);
 858                 break;
 859         }
 860 }
 861
 862 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
 863 {
 864         struct net *net = sock_net(sk);
 865         struct net_device *dev = NULL;
 866         bool match = false;
 867         int *vclock_index;
 868         int i, num;
 869
 870         if (sk->sk_bound_dev_if)
 871                 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
 872
 873         if (!dev) {
 874                 pr_err("%s: sock not bind to device\n", __func__);
 875                 return -EOPNOTSUPP;
 876         }
 877
 878         num = ethtool_get_phc_vclocks(dev, &vclock_index);
 879         dev_put(dev);
 880
 881         for (i = 0; i < num; i++) {
 882                 if (*(vclock_index + i) == phc_index) {
 883                         match = true;
 884                         break;
 885                 }
 886         }
 887
 888         if (num > 0)
 889                 kfree(vclock_index);
 890
 891         if (!match)
 892                 return -EINVAL;
 893
 894         sk->sk_bind_phc = phc_index;
 895
 896         return 0;
 897 }
 898
 899 int sock_set_timestamping(struct sock *sk, int optname,
 900                           struct so_timestamping timestamping)
 901 {
 902         int val = timestamping.flags;
 903         int ret;
 904
 905         if (val & ~SOF_TIMESTAMPING_MASK)
 906                 return -EINVAL;
 907
 908         if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
 909             !(val & SOF_TIMESTAMPING_OPT_ID))
 910                 return -EINVAL;
 911
 912         if (val & SOF_TIMESTAMPING_OPT_ID &&
 913             !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 914                 if (sk_is_tcp(sk)) {
 915                         if ((1 << sk->sk_state) &
 916                             (TCPF_CLOSE | TCPF_LISTEN))
 917                                 return -EINVAL;
 918                         if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
 919                                 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
 920                         else
 921                                 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
 922                 } else {
 923                         atomic_set(&sk->sk_tskey, 0);
 924                 }
 925         }
 926
 927         if (val & SOF_TIMESTAMPING_OPT_STATS &&
 928             !(val & SOF_TIMESTAMPING_OPT_TSONLY))
 929                 return -EINVAL;
 930
 931         if (val & SOF_TIMESTAMPING_BIND_PHC) {
 932                 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
 933                 if (ret)
 934                         return ret;
 935         }
 936
 937         sk->sk_tsflags = val;
 938         sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
 939
 940         if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 941                 sock_enable_timestamp(sk,
 942                                       SOCK_TIMESTAMPING_RX_SOFTWARE);
 943         else
 944                 sock_disable_timestamp(sk,
 945                                        (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 946         return 0;
 947 }
 948
 949 void sock_set_keepalive(struct sock *sk)
 950 {
 951         lock_sock(sk);
 952         if (sk->sk_prot->keepalive)
 953                 sk->sk_prot->keepalive(sk, true);
 954         sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
 955         release_sock(sk);
 956 }
 957 EXPORT_SYMBOL(sock_set_keepalive);
 958
 959 static void __sock_set_rcvbuf(struct sock *sk, int val)
 960 {
 961         /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
 962          * as a negative value.
 963          */
 964         val = min_t(int, val, INT_MAX / 2);
 965         sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 966
 967         /* We double it on the way in to account for "struct sk_buff" etc.
 968          * overhead.   Applications assume that the SO_RCVBUF setting they make
 969          * will allow that much actual data to be received on that socket.
 970          *
 971          * Applications are unaware that "struct sk_buff" and other overheads
 972          * allocate from the receive buffer during socket buffer allocation.
 973          *
 974          * And after considering the possible alternatives, returning the value
 975          * we actually used in getsockopt is the most desirable behavior.
 976          */
 977         WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
 978 }
 979
 980 void sock_set_rcvbuf(struct sock *sk, int val)
 981 {
 982         lock_sock(sk);
 983         __sock_set_rcvbuf(sk, val);
 984         release_sock(sk);
 985 }
 986 EXPORT_SYMBOL(sock_set_rcvbuf);
 987
 988 static void __sock_set_mark(struct sock *sk, u32 val)
 989 {
 990         if (val != sk->sk_mark) {
 991                 sk->sk_mark = val;
 992                 sk_dst_reset(sk);
 993         }
 994 }
 995
 996 void sock_set_mark(struct sock *sk, u32 val)
 997 {
 998         lock_sock(sk);
 999         __sock_set_mark(sk, val);
1000         release_sock(sk);
1001 }
1002 EXPORT_SYMBOL(sock_set_mark);
1003
1004 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1005 {
1006         /* Round down bytes to multiple of pages */
1007         bytes = round_down(bytes, PAGE_SIZE);
1008
1009         WARN_ON(bytes > sk->sk_reserved_mem);
1010         sk->sk_reserved_mem -= bytes;
1011         sk_mem_reclaim(sk);
1012 }
1013
1014 static int sock_reserve_memory(struct sock *sk, int bytes)
1015 {
1016         long allocated;
1017         bool charged;
1018         int pages;
1019
1020         if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1021                 return -EOPNOTSUPP;
1022
1023         if (!bytes)
1024                 return 0;
1025
1026         pages = sk_mem_pages(bytes);
1027
1028         /* pre-charge to memcg */
1029         charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1030                                           GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1031         if (!charged)
1032                 return -ENOMEM;
1033
1034         /* pre-charge to forward_alloc */
1035         sk_memory_allocated_add(sk, pages);
1036         allocated = sk_memory_allocated(sk);
1037         /* If the system goes into memory pressure with this
1038          * precharge, give up and return error.
1039          */
1040         if (allocated > sk_prot_mem_limits(sk, 1)) {
1041                 sk_memory_allocated_sub(sk, pages);
1042                 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1043                 return -ENOMEM;
1044         }
1045         sk->sk_forward_alloc += pages << PAGE_SHIFT;
1046
1047         sk->sk_reserved_mem += pages << PAGE_SHIFT;
1048
1049         return 0;
1050 }
1051
/**
 * sockopt_lock_sock - take the socket lock for a sockopt handler
 * @sk: socket to lock
 *
 * When current->bpf_ctx is set, the setsockopt is being called from a
 * bpf prog, and bpf has already acquired the sk lock before calling
 * setsockopt(); in that case this does nothing.  Otherwise it is
 * lock_sock().
 */
void sockopt_lock_sock(struct sock *sk)
{
	if (!has_current_bpf_ctx())
		lock_sock(sk);
}
EXPORT_SYMBOL(sockopt_lock_sock);
1064
/**
 * sockopt_release_sock - counterpart of sockopt_lock_sock()
 * @sk: socket to unlock
 *
 * Does nothing when has_current_bpf_ctx() is true, mirroring
 * sockopt_lock_sock(), which takes no lock in that case.  Otherwise it
 * is release_sock().
 */
void sockopt_release_sock(struct sock *sk)
{
	if (!has_current_bpf_ctx())
		release_sock(sk);
}
EXPORT_SYMBOL(sockopt_release_sock);
1073
1074 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1075 {
1076         return has_current_bpf_ctx() || ns_capable(ns, cap);
1077 }
1078 EXPORT_SYMBOL(sockopt_ns_capable);
1079
1080 bool sockopt_capable(int cap)
1081 {
1082         return has_current_bpf_ctx() || capable(cap);
1083 }
1084 EXPORT_SYMBOL(sockopt_capable);
1085
1086 /*
1087  *      This is meant for all protocols to use and covers goings on
1088  *      at the socket level. Everything here is generic.
1089  */
1090
1091 int sk_setsockopt(struct sock *sk, int level, int optname,
1092                   sockptr_t optval, unsigned int optlen)
1093 {
1094         struct so_timestamping timestamping;
1095         struct socket *sock = sk->sk_socket;
1096         struct sock_txtime sk_txtime;
1097         int val;
1098         int valbool;
1099         struct linger ling;
1100         int ret = 0;
1101
1102         /*
1103          *      Options without arguments
1104          */
1105
1106         if (optname == SO_BINDTODEVICE)
1107                 return sock_setbindtodevice(sk, optval, optlen);
1108
1109         if (optlen < sizeof(int))
1110                 return -EINVAL;
1111
1112         if (copy_from_sockptr(&val, optval, sizeof(val)))
1113                 return -EFAULT;
1114
1115         valbool = val ? 1 : 0;
1116
1117         sockopt_lock_sock(sk);
1118
1119         switch (optname) {
1120         case SO_DEBUG:
1121                 if (val && !sockopt_capable(CAP_NET_ADMIN))
1122                         ret = -EACCES;
1123                 else
1124                         sock_valbool_flag(sk, SOCK_DBG, valbool);
1125                 break;
1126         case SO_REUSEADDR:
1127                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1128                 break;
1129         case SO_REUSEPORT:
1130                 sk->sk_reuseport = valbool;
1131                 break;
1132         case SO_TYPE:
1133         case SO_PROTOCOL:
1134         case SO_DOMAIN:
1135         case SO_ERROR:
1136                 ret = -ENOPROTOOPT;
1137                 break;
1138         case SO_DONTROUTE:
1139                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1140                 sk_dst_reset(sk);
1141                 break;
1142         case SO_BROADCAST:
1143                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1144                 break;
1145         case SO_SNDBUF:
1146                 /* Don't error on this BSD doesn't and if you think
1147                  * about it this is right. Otherwise apps have to
1148                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
1149                  * are treated in BSD as hints
1150                  */
1151                 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1152 set_sndbuf:
1153                 /* Ensure val * 2 fits into an int, to prevent max_t()
1154                  * from treating it as a negative value.
1155                  */
1156                 val = min_t(int, val, INT_MAX / 2);
1157                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1158                 WRITE_ONCE(sk->sk_sndbuf,
1159                            max_t(int, val * 2, SOCK_MIN_SNDBUF));
1160                 /* Wake up sending tasks if we upped the value. */
1161                 sk->sk_write_space(sk);
1162                 break;
1163
1164         case SO_SNDBUFFORCE:
1165                 if (!sockopt_capable(CAP_NET_ADMIN)) {
1166                         ret = -EPERM;
1167                         break;
1168                 }
1169
1170                 /* No negative values (to prevent underflow, as val will be
1171                  * multiplied by 2).
1172                  */
1173                 if (val < 0)
1174                         val = 0;
1175                 goto set_sndbuf;
1176
1177         case SO_RCVBUF:
1178                 /* Don't error on this BSD doesn't and if you think
1179                  * about it this is right. Otherwise apps have to
1180                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
1181                  * are treated in BSD as hints
1182                  */
1183                 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1184                 break;
1185
1186         case SO_RCVBUFFORCE:
1187                 if (!sockopt_capable(CAP_NET_ADMIN)) {
1188                         ret = -EPERM;
1189                         break;
1190                 }
1191
1192                 /* No negative values (to prevent underflow, as val will be
1193                  * multiplied by 2).
1194                  */
1195                 __sock_set_rcvbuf(sk, max(val, 0));
1196                 break;
1197
1198         case SO_KEEPALIVE:
1199                 if (sk->sk_prot->keepalive)
1200                         sk->sk_prot->keepalive(sk, valbool);
1201                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1202                 break;
1203
1204         case SO_OOBINLINE:
1205                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1206                 break;
1207
1208         case SO_NO_CHECK:
1209                 sk->sk_no_check_tx = valbool;
1210                 break;
1211
1212         case SO_PRIORITY:
1213                 if ((val >= 0 && val <= 6) ||
1214                     sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1215                     sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1216                         sk->sk_priority = val;
1217                 else
1218                         ret = -EPERM;
1219                 break;
1220
1221         case SO_LINGER:
1222                 if (optlen < sizeof(ling)) {
1223                         ret = -EINVAL;  /* 1003.1g */
1224                         break;
1225                 }
1226                 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1227                         ret = -EFAULT;
1228                         break;
1229                 }
1230                 if (!ling.l_onoff)
1231                         sock_reset_flag(sk, SOCK_LINGER);
1232                 else {
1233 #if (BITS_PER_LONG == 32)
1234                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1235                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1236                         else
1237 #endif
1238                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1239                         sock_set_flag(sk, SOCK_LINGER);
1240                 }
1241                 break;
1242
1243         case SO_BSDCOMPAT:
1244                 break;
1245
1246         case SO_PASSCRED:
1247                 if (valbool)
1248                         set_bit(SOCK_PASSCRED, &sock->flags);
1249                 else
1250                         clear_bit(SOCK_PASSCRED, &sock->flags);
1251                 break;
1252
1253         case SO_PASSPIDFD:
1254                 if (valbool)
1255                         set_bit(SOCK_PASSPIDFD, &sock->flags);
1256                 else
1257                         clear_bit(SOCK_PASSPIDFD, &sock->flags);
1258                 break;
1259
1260         case SO_TIMESTAMP_OLD:
1261         case SO_TIMESTAMP_NEW:
1262         case SO_TIMESTAMPNS_OLD:
1263         case SO_TIMESTAMPNS_NEW:
1264                 sock_set_timestamp(sk, optname, valbool);
1265                 break;
1266
1267         case SO_TIMESTAMPING_NEW:
1268         case SO_TIMESTAMPING_OLD:
1269                 if (optlen == sizeof(timestamping)) {
1270                         if (copy_from_sockptr(&timestamping, optval,
1271                                               sizeof(timestamping))) {
1272                                 ret = -EFAULT;
1273                                 break;
1274                         }
1275                 } else {
1276                         memset(&timestamping, 0, sizeof(timestamping));
1277                         timestamping.flags = val;
1278                 }
1279                 ret = sock_set_timestamping(sk, optname, timestamping);
1280                 break;
1281
1282         case SO_RCVLOWAT:
1283                 if (val < 0)
1284                         val = INT_MAX;
1285                 if (sock && sock->ops->set_rcvlowat)
1286                         ret = sock->ops->set_rcvlowat(sk, val);
1287                 else
1288                         WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1289                 break;
1290
1291         case SO_RCVTIMEO_OLD:
1292         case SO_RCVTIMEO_NEW:
1293                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1294                                        optlen, optname == SO_RCVTIMEO_OLD);
1295                 break;
1296
1297         case SO_SNDTIMEO_OLD:
1298         case SO_SNDTIMEO_NEW:
1299                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1300                                        optlen, optname == SO_SNDTIMEO_OLD);
1301                 break;
1302
1303         case SO_ATTACH_FILTER: {
1304                 struct sock_fprog fprog;
1305
1306                 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1307                 if (!ret)
1308                         ret = sk_attach_filter(&fprog, sk);
1309                 break;
1310         }
1311         case SO_ATTACH_BPF:
1312                 ret = -EINVAL;
1313                 if (optlen == sizeof(u32)) {
1314                         u32 ufd;
1315
1316                         ret = -EFAULT;
1317                         if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1318                                 break;
1319
1320                         ret = sk_attach_bpf(ufd, sk);
1321                 }
1322                 break;
1323
1324         case SO_ATTACH_REUSEPORT_CBPF: {
1325                 struct sock_fprog fprog;
1326
1327                 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1328                 if (!ret)
1329                         ret = sk_reuseport_attach_filter(&fprog, sk);
1330                 break;
1331         }
1332         case SO_ATTACH_REUSEPORT_EBPF:
1333                 ret = -EINVAL;
1334                 if (optlen == sizeof(u32)) {
1335                         u32 ufd;
1336
1337                         ret = -EFAULT;
1338                         if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1339                                 break;
1340
1341                         ret = sk_reuseport_attach_bpf(ufd, sk);
1342                 }
1343                 break;
1344
1345         case SO_DETACH_REUSEPORT_BPF:
1346                 ret = reuseport_detach_prog(sk);
1347                 break;
1348
1349         case SO_DETACH_FILTER:
1350                 ret = sk_detach_filter(sk);
1351                 break;
1352
1353         case SO_LOCK_FILTER:
1354                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1355                         ret = -EPERM;
1356                 else
1357                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1358                 break;
1359
1360         case SO_PASSSEC:
1361                 if (valbool)
1362                         set_bit(SOCK_PASSSEC, &sock->flags);
1363                 else
1364                         clear_bit(SOCK_PASSSEC, &sock->flags);
1365                 break;
1366         case SO_MARK:
1367                 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1368                     !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1369                         ret = -EPERM;
1370                         break;
1371                 }
1372
1373                 __sock_set_mark(sk, val);
1374                 break;
1375         case SO_RCVMARK:
1376                 sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1377                 break;
1378
1379         case SO_RXQ_OVFL:
1380                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1381                 break;
1382
1383         case SO_WIFI_STATUS:
1384                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1385                 break;
1386
1387         case SO_PEEK_OFF:
1388                 if (sock->ops->set_peek_off)
1389                         ret = sock->ops->set_peek_off(sk, val);
1390                 else
1391                         ret = -EOPNOTSUPP;
1392                 break;
1393
1394         case SO_NOFCS:
1395                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1396                 break;
1397
1398         case SO_SELECT_ERR_QUEUE:
1399                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1400                 break;
1401
1402 #ifdef CONFIG_NET_RX_BUSY_POLL
1403         case SO_BUSY_POLL:
1404                 if (val < 0)
1405                         ret = -EINVAL;
1406                 else
1407                         WRITE_ONCE(sk->sk_ll_usec, val);
1408                 break;
1409         case SO_PREFER_BUSY_POLL:
1410                 if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1411                         ret = -EPERM;
1412                 else
1413                         WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1414                 break;
1415         case SO_BUSY_POLL_BUDGET:
1416                 if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) {
1417                         ret = -EPERM;
1418                 } else {
1419                         if (val < 0 || val > U16_MAX)
1420                                 ret = -EINVAL;
1421                         else
1422                                 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1423                 }
1424                 break;
1425 #endif
1426
1427         case SO_MAX_PACING_RATE:
1428                 {
1429                 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1430
1431                 if (sizeof(ulval) != sizeof(val) &&
1432                     optlen >= sizeof(ulval) &&
1433                     copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1434                         ret = -EFAULT;
1435                         break;
1436                 }
1437                 if (ulval != ~0UL)
1438                         cmpxchg(&sk->sk_pacing_status,
1439                                 SK_PACING_NONE,
1440                                 SK_PACING_NEEDED);
1441                 sk->sk_max_pacing_rate = ulval;
1442                 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1443                 break;
1444                 }
1445         case SO_INCOMING_CPU:
1446                 reuseport_update_incoming_cpu(sk, val);
1447                 break;
1448
1449         case SO_CNX_ADVICE:
1450                 if (val == 1)
1451                         dst_negative_advice(sk);
1452                 break;
1453
1454         case SO_ZEROCOPY:
1455                 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1456                         if (!(sk_is_tcp(sk) ||
1457                               (sk->sk_type == SOCK_DGRAM &&
1458                                sk->sk_protocol == IPPROTO_UDP)))
1459                                 ret = -EOPNOTSUPP;
1460                 } else if (sk->sk_family != PF_RDS) {
1461                         ret = -EOPNOTSUPP;
1462                 }
1463                 if (!ret) {
1464                         if (val < 0 || val > 1)
1465                                 ret = -EINVAL;
1466                         else
1467                                 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1468                 }
1469                 break;
1470
1471         case SO_TXTIME:
1472                 if (optlen != sizeof(struct sock_txtime)) {
1473                         ret = -EINVAL;
1474                         break;
1475                 } else if (copy_from_sockptr(&sk_txtime, optval,
1476                            sizeof(struct sock_txtime))) {
1477                         ret = -EFAULT;
1478                         break;
1479                 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1480                         ret = -EINVAL;
1481                         break;
1482                 }
1483                 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1484                  * scheduler has enough safe guards.
1485                  */
1486                 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1487                     !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1488                         ret = -EPERM;
1489                         break;
1490                 }
1491                 sock_valbool_flag(sk, SOCK_TXTIME, true);
1492                 sk->sk_clockid = sk_txtime.clockid;
1493                 sk->sk_txtime_deadline_mode =
1494                         !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1495                 sk->sk_txtime_report_errors =
1496                         !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1497                 break;
1498
1499         case SO_BINDTOIFINDEX:
1500                 ret = sock_bindtoindex_locked(sk, val);
1501                 break;
1502
1503         case SO_BUF_LOCK:
1504                 if (val & ~SOCK_BUF_LOCK_MASK) {
1505                         ret = -EINVAL;
1506                         break;
1507                 }
1508                 sk->sk_userlocks = val | (sk->sk_userlocks &
1509                                           ~SOCK_BUF_LOCK_MASK);
1510                 break;
1511
1512         case SO_RESERVE_MEM:
1513         {
1514                 int delta;
1515
1516                 if (val < 0) {
1517                         ret = -EINVAL;
1518                         break;
1519                 }
1520
1521                 delta = val - sk->sk_reserved_mem;
1522                 if (delta < 0)
1523                         sock_release_reserved_memory(sk, -delta);
1524                 else
1525                         ret = sock_reserve_memory(sk, delta);
1526                 break;
1527         }
1528
1529         case SO_TXREHASH:
1530                 if (val < -1 || val > 1) {
1531                         ret = -EINVAL;
1532                         break;
1533                 }
1534                 if ((u8)val == SOCK_TXREHASH_DEFAULT)
1535                         val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1536                 /* Paired with READ_ONCE() in tcp_rtx_synack() */
1537                 WRITE_ONCE(sk->sk_txrehash, (u8)val);
1538                 break;
1539
1540         default:
1541                 ret = -ENOPROTOOPT;
1542                 break;
1543         }
1544         sockopt_release_sock(sk);
1545         return ret;
1546 }
1547
1548 int sock_setsockopt(struct socket *sock, int level, int optname,
1549                     sockptr_t optval, unsigned int optlen)
1550 {
1551         return sk_setsockopt(sock->sk, level, optname,
1552                              optval, optlen);
1553 }
1554 EXPORT_SYMBOL(sock_setsockopt);
1555
1556 static const struct cred *sk_get_peer_cred(struct sock *sk)
1557 {
1558         const struct cred *cred;
1559
1560         spin_lock(&sk->sk_peer_lock);
1561         cred = get_cred(sk->sk_peer_cred);
1562         spin_unlock(&sk->sk_peer_lock);
1563
1564         return cred;
1565 }
1566
1567 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1568                           struct ucred *ucred)
1569 {
1570         ucred->pid = pid_vnr(pid);
1571         ucred->uid = ucred->gid = -1;
1572         if (cred) {
1573                 struct user_namespace *current_ns = current_user_ns();
1574
1575                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1576                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1577         }
1578 }
1579
1580 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1581 {
1582         struct user_namespace *user_ns = current_user_ns();
1583         int i;
1584
1585         for (i = 0; i < src->ngroups; i++) {
1586                 gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1587
1588                 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1589                         return -EFAULT;
1590         }
1591
1592         return 0;
1593 }
1594
1595 int sk_getsockopt(struct sock *sk, int level, int optname,
1596                   sockptr_t optval, sockptr_t optlen)
1597 {
1598         struct socket *sock = sk->sk_socket;
1599
1600         union {
1601                 int val;
1602                 u64 val64;
1603                 unsigned long ulval;
1604                 struct linger ling;
1605                 struct old_timeval32 tm32;
1606                 struct __kernel_old_timeval tm;
1607                 struct  __kernel_sock_timeval stm;
1608                 struct sock_txtime txtime;
1609                 struct so_timestamping timestamping;
1610         } v;
1611
1612         int lv = sizeof(int);
1613         int len;
1614
1615         if (copy_from_sockptr(&len, optlen, sizeof(int)))
1616                 return -EFAULT;
1617         if (len < 0)
1618                 return -EINVAL;
1619
1620         memset(&v, 0, sizeof(v));
1621
1622         switch (optname) {
1623         case SO_DEBUG:
1624                 v.val = sock_flag(sk, SOCK_DBG);
1625                 break;
1626
1627         case SO_DONTROUTE:
1628                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1629                 break;
1630
1631         case SO_BROADCAST:
1632                 v.val = sock_flag(sk, SOCK_BROADCAST);
1633                 break;
1634
1635         case SO_SNDBUF:
1636                 v.val = sk->sk_sndbuf;
1637                 break;
1638
1639         case SO_RCVBUF:
1640                 v.val = sk->sk_rcvbuf;
1641                 break;
1642
1643         case SO_REUSEADDR:
1644                 v.val = sk->sk_reuse;
1645                 break;
1646
1647         case SO_REUSEPORT:
1648                 v.val = sk->sk_reuseport;
1649                 break;
1650
1651         case SO_KEEPALIVE:
1652                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1653                 break;
1654
1655         case SO_TYPE:
1656                 v.val = sk->sk_type;
1657                 break;
1658
1659         case SO_PROTOCOL:
1660                 v.val = sk->sk_protocol;
1661                 break;
1662
1663         case SO_DOMAIN:
1664                 v.val = sk->sk_family;
1665                 break;
1666
1667         case SO_ERROR:
1668                 v.val = -sock_error(sk);
1669                 if (v.val == 0)
1670                         v.val = xchg(&sk->sk_err_soft, 0);
1671                 break;
1672
1673         case SO_OOBINLINE:
1674                 v.val = sock_flag(sk, SOCK_URGINLINE);
1675                 break;
1676
1677         case SO_NO_CHECK:
1678                 v.val = sk->sk_no_check_tx;
1679                 break;
1680
1681         case SO_PRIORITY:
1682                 v.val = sk->sk_priority;
1683                 break;
1684
1685         case SO_LINGER:
1686                 lv              = sizeof(v.ling);
1687                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1688                 v.ling.l_linger = sk->sk_lingertime / HZ;
1689                 break;
1690
1691         case SO_BSDCOMPAT:
1692                 break;
1693
1694         case SO_TIMESTAMP_OLD:
1695                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1696                                 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1697                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1698                 break;
1699
1700         case SO_TIMESTAMPNS_OLD:
1701                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1702                 break;
1703
1704         case SO_TIMESTAMP_NEW:
1705                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1706                 break;
1707
1708         case SO_TIMESTAMPNS_NEW:
1709                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1710                 break;
1711
1712         case SO_TIMESTAMPING_OLD:
1713                 lv = sizeof(v.timestamping);
1714                 v.timestamping.flags = sk->sk_tsflags;
1715                 v.timestamping.bind_phc = sk->sk_bind_phc;
1716                 break;
1717
1718         case SO_RCVTIMEO_OLD:
1719         case SO_RCVTIMEO_NEW:
1720                 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1721                 break;
1722
1723         case SO_SNDTIMEO_OLD:
1724         case SO_SNDTIMEO_NEW:
1725                 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1726                 break;
1727
1728         case SO_RCVLOWAT:
1729                 v.val = sk->sk_rcvlowat;
1730                 break;
1731
1732         case SO_SNDLOWAT:
1733                 v.val = 1;
1734                 break;
1735
1736         case SO_PASSCRED:
1737                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1738                 break;
1739
1740         case SO_PASSPIDFD:
1741                 v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1742                 break;
1743
1744         case SO_PEERCRED:
1745         {
1746                 struct ucred peercred;
1747                 if (len > sizeof(peercred))
1748                         len = sizeof(peercred);
1749
1750                 spin_lock(&sk->sk_peer_lock);
1751                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1752                 spin_unlock(&sk->sk_peer_lock);
1753
1754                 if (copy_to_sockptr(optval, &peercred, len))
1755                         return -EFAULT;
1756                 goto lenout;
1757         }
1758
1759         case SO_PEERPIDFD:
1760         {
1761                 struct pid *peer_pid;
1762                 struct file *pidfd_file = NULL;
1763                 int pidfd;
1764
1765                 if (len > sizeof(pidfd))
1766                         len = sizeof(pidfd);
1767
1768                 spin_lock(&sk->sk_peer_lock);
1769                 peer_pid = get_pid(sk->sk_peer_pid);
1770                 spin_unlock(&sk->sk_peer_lock);
1771
1772                 if (!peer_pid)
1773                         return -ESRCH;
1774
1775                 pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1776                 put_pid(peer_pid);
1777                 if (pidfd < 0)
1778                         return pidfd;
1779
1780                 if (copy_to_sockptr(optval, &pidfd, len) ||
1781                     copy_to_sockptr(optlen, &len, sizeof(int))) {
1782                         put_unused_fd(pidfd);
1783                         fput(pidfd_file);
1784
1785                         return -EFAULT;
1786                 }
1787
1788                 fd_install(pidfd, pidfd_file);
1789                 return 0;
1790         }
1791
1792         case SO_PEERGROUPS:
1793         {
1794                 const struct cred *cred;
1795                 int ret, n;
1796
1797                 cred = sk_get_peer_cred(sk);
1798                 if (!cred)
1799                         return -ENODATA;
1800
1801                 n = cred->group_info->ngroups;
1802                 if (len < n * sizeof(gid_t)) {
1803                         len = n * sizeof(gid_t);
1804                         put_cred(cred);
1805                         return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1806                 }
1807                 len = n * sizeof(gid_t);
1808
1809                 ret = groups_to_user(optval, cred->group_info);
1810                 put_cred(cred);
1811                 if (ret)
1812                         return ret;
1813                 goto lenout;
1814         }
1815
1816         case SO_PEERNAME:
1817         {
1818                 char address[128];
1819
1820                 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1821                 if (lv < 0)
1822                         return -ENOTCONN;
1823                 if (lv < len)
1824                         return -EINVAL;
1825                 if (copy_to_sockptr(optval, address, len))
1826                         return -EFAULT;
1827                 goto lenout;
1828         }
1829
1830         /* Dubious BSD thing... Probably nobody even uses it, but
1831          * the UNIX standard wants it for whatever reason... -DaveM
1832          */
1833         case SO_ACCEPTCONN:
1834                 v.val = sk->sk_state == TCP_LISTEN;
1835                 break;
1836
1837         case SO_PASSSEC:
1838                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1839                 break;
1840
1841         case SO_PEERSEC:
1842                 return security_socket_getpeersec_stream(sock,
1843                                                          optval, optlen, len);
1844
1845         case SO_MARK:
1846                 v.val = sk->sk_mark;
1847                 break;
1848
1849         case SO_RCVMARK:
1850                 v.val = sock_flag(sk, SOCK_RCVMARK);
1851                 break;
1852
1853         case SO_RXQ_OVFL:
1854                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1855                 break;
1856
1857         case SO_WIFI_STATUS:
1858                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1859                 break;
1860
1861         case SO_PEEK_OFF:
1862                 if (!sock->ops->set_peek_off)
1863                         return -EOPNOTSUPP;
1864
1865                 v.val = sk->sk_peek_off;
1866                 break;
1867         case SO_NOFCS:
1868                 v.val = sock_flag(sk, SOCK_NOFCS);
1869                 break;
1870
1871         case SO_BINDTODEVICE:
1872                 return sock_getbindtodevice(sk, optval, optlen, len);
1873
1874         case SO_GET_FILTER:
1875                 len = sk_get_filter(sk, optval, len);
1876                 if (len < 0)
1877                         return len;
1878
1879                 goto lenout;
1880
1881         case SO_LOCK_FILTER:
1882                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1883                 break;
1884
1885         case SO_BPF_EXTENSIONS:
1886                 v.val = bpf_tell_extensions();
1887                 break;
1888
1889         case SO_SELECT_ERR_QUEUE:
1890                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1891                 break;
1892
1893 #ifdef CONFIG_NET_RX_BUSY_POLL
1894         case SO_BUSY_POLL:
1895                 v.val = sk->sk_ll_usec;
1896                 break;
1897         case SO_PREFER_BUSY_POLL:
1898                 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1899                 break;
1900 #endif
1901
1902         case SO_MAX_PACING_RATE:
1903                 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1904                         lv = sizeof(v.ulval);
1905                         v.ulval = sk->sk_max_pacing_rate;
1906                 } else {
1907                         /* 32bit version */
1908                         v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1909                 }
1910                 break;
1911
1912         case SO_INCOMING_CPU:
1913                 v.val = READ_ONCE(sk->sk_incoming_cpu);
1914                 break;
1915
1916         case SO_MEMINFO:
1917         {
1918                 u32 meminfo[SK_MEMINFO_VARS];
1919
1920                 sk_get_meminfo(sk, meminfo);
1921
1922                 len = min_t(unsigned int, len, sizeof(meminfo));
1923                 if (copy_to_sockptr(optval, &meminfo, len))
1924                         return -EFAULT;
1925
1926                 goto lenout;
1927         }
1928
1929 #ifdef CONFIG_NET_RX_BUSY_POLL
1930         case SO_INCOMING_NAPI_ID:
1931                 v.val = READ_ONCE(sk->sk_napi_id);
1932
1933                 /* aggregate non-NAPI IDs down to 0 */
1934                 if (v.val < MIN_NAPI_ID)
1935                         v.val = 0;
1936
1937                 break;
1938 #endif
1939
1940         case SO_COOKIE:
1941                 lv = sizeof(u64);
1942                 if (len < lv)
1943                         return -EINVAL;
1944                 v.val64 = sock_gen_cookie(sk);
1945                 break;
1946
1947         case SO_ZEROCOPY:
1948                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1949                 break;
1950
1951         case SO_TXTIME:
1952                 lv = sizeof(v.txtime);
1953                 v.txtime.clockid = sk->sk_clockid;
1954                 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1955                                   SOF_TXTIME_DEADLINE_MODE : 0;
1956                 v.txtime.flags |= sk->sk_txtime_report_errors ?
1957                                   SOF_TXTIME_REPORT_ERRORS : 0;
1958                 break;
1959
1960         case SO_BINDTOIFINDEX:
1961                 v.val = READ_ONCE(sk->sk_bound_dev_if);
1962                 break;
1963
1964         case SO_NETNS_COOKIE:
1965                 lv = sizeof(u64);
1966                 if (len != lv)
1967                         return -EINVAL;
1968                 v.val64 = sock_net(sk)->net_cookie;
1969                 break;
1970
1971         case SO_BUF_LOCK:
1972                 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1973                 break;
1974
1975         case SO_RESERVE_MEM:
1976                 v.val = sk->sk_reserved_mem;
1977                 break;
1978
1979         case SO_TXREHASH:
1980                 v.val = sk->sk_txrehash;
1981                 break;
1982
1983         default:
1984                 /* We implement the SO_SNDLOWAT etc to not be settable
1985                  * (1003.1g 7).
1986                  */
1987                 return -ENOPROTOOPT;
1988         }
1989
1990         if (len > lv)
1991                 len = lv;
1992         if (copy_to_sockptr(optval, &v, len))
1993                 return -EFAULT;
1994 lenout:
1995         if (copy_to_sockptr(optlen, &len, sizeof(int)))
1996                 return -EFAULT;
1997         return 0;
1998 }
1999
2000 int sock_getsockopt(struct socket *sock, int level, int optname,
2001                     char __user *optval, int __user *optlen)
2002 {
2003         return sk_getsockopt(sock->sk, level, optname,
2004                              USER_SOCKPTR(optval),
2005                              USER_SOCKPTR(optlen));
2006 }
2007
2008 /*
2009  * Initialize an sk_lock.
2010  *
2011  * (We also register the sk_lock with the lock validator.)
2012  */
2013 static inline void sock_lock_init(struct sock *sk)
2014 {
2015         if (sk->sk_kern_sock)
2016                 sock_lock_init_class_and_name(
2017                         sk,
2018                         af_family_kern_slock_key_strings[sk->sk_family],
2019                         af_family_kern_slock_keys + sk->sk_family,
2020                         af_family_kern_key_strings[sk->sk_family],
2021                         af_family_kern_keys + sk->sk_family);
2022         else
2023                 sock_lock_init_class_and_name(
2024                         sk,
2025                         af_family_slock_key_strings[sk->sk_family],
2026                         af_family_slock_keys + sk->sk_family,
2027                         af_family_key_strings[sk->sk_family],
2028                         af_family_keys + sk->sk_family);
2029 }
2030
2031 /*
2032  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2033  * even temporarly, because of RCU lookups. sk_node should also be left as is.
2034  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2035  */
2036 static void sock_copy(struct sock *nsk, const struct sock *osk)
2037 {
2038         const struct proto *prot = READ_ONCE(osk->sk_prot);
2039 #ifdef CONFIG_SECURITY_NETWORK
2040         void *sptr = nsk->sk_security;
2041 #endif
2042
2043         /* If we move sk_tx_queue_mapping out of the private section,
2044          * we must check if sk_tx_queue_clear() is called after
2045          * sock_copy() in sk_clone_lock().
2046          */
2047         BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2048                      offsetof(struct sock, sk_dontcopy_begin) ||
2049                      offsetof(struct sock, sk_tx_queue_mapping) >=
2050                      offsetof(struct sock, sk_dontcopy_end));
2051
2052         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2053
2054         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2055                prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
2056
2057 #ifdef CONFIG_SECURITY_NETWORK
2058         nsk->sk_security = sptr;
2059         security_sk_clone(osk, nsk);
2060 #endif
2061 }
2062
2063 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2064                 int family)
2065 {
2066         struct sock *sk;
2067         struct kmem_cache *slab;
2068
2069         slab = prot->slab;
2070         if (slab != NULL) {
2071                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2072                 if (!sk)
2073                         return sk;
2074                 if (want_init_on_alloc(priority))
2075                         sk_prot_clear_nulls(sk, prot->obj_size);
2076         } else
2077                 sk = kmalloc(prot->obj_size, priority);
2078
2079         if (sk != NULL) {
2080                 if (security_sk_alloc(sk, family, priority))
2081                         goto out_free;
2082
2083                 if (!try_module_get(prot->owner))
2084                         goto out_free_sec;
2085         }
2086
2087         return sk;
2088
2089 out_free_sec:
2090         security_sk_free(sk);
2091 out_free:
2092         if (slab != NULL)
2093                 kmem_cache_free(slab, sk);
2094         else
2095                 kfree(sk);
2096         return NULL;
2097 }
2098
2099 static void sk_prot_free(struct proto *prot, struct sock *sk)
2100 {
2101         struct kmem_cache *slab;
2102         struct module *owner;
2103
2104         owner = prot->owner;
2105         slab = prot->slab;
2106
2107         cgroup_sk_free(&sk->sk_cgrp_data);
2108         mem_cgroup_sk_free(sk);
2109         security_sk_free(sk);
2110         if (slab != NULL)
2111                 kmem_cache_free(slab, sk);
2112         else
2113                 kfree(sk);
2114         module_put(owner);
2115 }
2116
2117 /**
2118  *      sk_alloc - All socket objects are allocated here
2119  *      @net: the applicable net namespace
2120  *      @family: protocol family
2121  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2122  *      @prot: struct proto associated with this new sock instance
2123  *      @kern: is this to be a kernel socket?
2124  */
2125 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2126                       struct proto *prot, int kern)
2127 {
2128         struct sock *sk;
2129
2130         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2131         if (sk) {
2132                 sk->sk_family = family;
2133                 /*
2134                  * See comment in struct sock definition to understand
2135                  * why we need sk_prot_creator -acme
2136                  */
2137                 sk->sk_prot = sk->sk_prot_creator = prot;
2138                 sk->sk_kern_sock = kern;
2139                 sock_lock_init(sk);
2140                 sk->sk_net_refcnt = kern ? 0 : 1;
2141                 if (likely(sk->sk_net_refcnt)) {
2142                         get_net_track(net, &sk->ns_tracker, priority);
2143                         sock_inuse_add(net, 1);
2144                 } else {
2145                         __netns_tracker_alloc(net, &sk->ns_tracker,
2146                                               false, priority);
2147                 }
2148
2149                 sock_net_set(sk, net);
2150                 refcount_set(&sk->sk_wmem_alloc, 1);
2151
2152                 mem_cgroup_sk_alloc(sk);
2153                 cgroup_sk_alloc(&sk->sk_cgrp_data);
2154                 sock_update_classid(&sk->sk_cgrp_data);
2155                 sock_update_netprioidx(&sk->sk_cgrp_data);
2156                 sk_tx_queue_clear(sk);
2157         }
2158
2159         return sk;
2160 }
2161 EXPORT_SYMBOL(sk_alloc);
2162
2163 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2164  * grace period. This is the case for UDP sockets and TCP listeners.
2165  */
2166 static void __sk_destruct(struct rcu_head *head)
2167 {
2168         struct sock *sk = container_of(head, struct sock, sk_rcu);
2169         struct sk_filter *filter;
2170
2171         if (sk->sk_destruct)
2172                 sk->sk_destruct(sk);
2173
2174         filter = rcu_dereference_check(sk->sk_filter,
2175                                        refcount_read(&sk->sk_wmem_alloc) == 0);
2176         if (filter) {
2177                 sk_filter_uncharge(sk, filter);
2178                 RCU_INIT_POINTER(sk->sk_filter, NULL);
2179         }
2180
2181         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2182
2183 #ifdef CONFIG_BPF_SYSCALL
2184         bpf_sk_storage_free(sk);
2185 #endif
2186
2187         if (atomic_read(&sk->sk_omem_alloc))
2188                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2189                          __func__, atomic_read(&sk->sk_omem_alloc));
2190
2191         if (sk->sk_frag.page) {
2192                 put_page(sk->sk_frag.page);
2193                 sk->sk_frag.page = NULL;
2194         }
2195
2196         /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2197         put_cred(sk->sk_peer_cred);
2198         put_pid(sk->sk_peer_pid);
2199
2200         if (likely(sk->sk_net_refcnt))
2201                 put_net_track(sock_net(sk), &sk->ns_tracker);
2202         else
2203                 __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2204
2205         sk_prot_free(sk->sk_prot_creator, sk);
2206 }
2207
2208 void sk_destruct(struct sock *sk)
2209 {
2210         bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2211
2212         if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2213                 reuseport_detach_sock(sk);
2214                 use_call_rcu = true;
2215         }
2216
2217         if (use_call_rcu)
2218                 call_rcu(&sk->sk_rcu, __sk_destruct);
2219         else
2220                 __sk_destruct(&sk->sk_rcu);
2221 }
2222
2223 static void __sk_free(struct sock *sk)
2224 {
2225         if (likely(sk->sk_net_refcnt))
2226                 sock_inuse_add(sock_net(sk), -1);
2227
2228         if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2229                 sock_diag_broadcast_destroy(sk);
2230         else
2231                 sk_destruct(sk);
2232 }
2233
2234 void sk_free(struct sock *sk)
2235 {
2236         /*
2237          * We subtract one from sk_wmem_alloc and can know if
2238          * some packets are still in some tx queue.
2239          * If not null, sock_wfree() will call __sk_free(sk) later
2240          */
2241         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2242                 __sk_free(sk);
2243 }
2244 EXPORT_SYMBOL(sk_free);
2245
2246 static void sk_init_common(struct sock *sk)
2247 {
2248         skb_queue_head_init(&sk->sk_receive_queue);
2249         skb_queue_head_init(&sk->sk_write_queue);
2250         skb_queue_head_init(&sk->sk_error_queue);
2251
2252         rwlock_init(&sk->sk_callback_lock);
2253         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2254                         af_rlock_keys + sk->sk_family,
2255                         af_family_rlock_key_strings[sk->sk_family]);
2256         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2257                         af_wlock_keys + sk->sk_family,
2258                         af_family_wlock_key_strings[sk->sk_family]);
2259         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2260                         af_elock_keys + sk->sk_family,
2261                         af_family_elock_key_strings[sk->sk_family]);
2262         lockdep_set_class_and_name(&sk->sk_callback_lock,
2263                         af_callback_keys + sk->sk_family,
2264                         af_family_clock_key_strings[sk->sk_family]);
2265 }
2266
2267 /**
2268  *      sk_clone_lock - clone a socket, and lock its clone
2269  *      @sk: the socket to clone
2270  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2271  *
2272  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2273  */
2274 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2275 {
2276         struct proto *prot = READ_ONCE(sk->sk_prot);
2277         struct sk_filter *filter;
2278         bool is_charged = true;
2279         struct sock *newsk;
2280
2281         newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2282         if (!newsk)
2283                 goto out;
2284
2285         sock_copy(newsk, sk);
2286
2287         newsk->sk_prot_creator = prot;
2288
2289         /* SANITY */
2290         if (likely(newsk->sk_net_refcnt)) {
2291                 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2292                 sock_inuse_add(sock_net(newsk), 1);
2293         } else {
2294                 /* Kernel sockets are not elevating the struct net refcount.
2295                  * Instead, use a tracker to more easily detect if a layer
2296                  * is not properly dismantling its kernel sockets at netns
2297                  * destroy time.
2298                  */
2299                 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2300                                       false, priority);
2301         }
2302         sk_node_init(&newsk->sk_node);
2303         sock_lock_init(newsk);
2304         bh_lock_sock(newsk);
2305         newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
2306         newsk->sk_backlog.len = 0;
2307
2308         atomic_set(&newsk->sk_rmem_alloc, 0);
2309
2310         /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2311         refcount_set(&newsk->sk_wmem_alloc, 1);
2312
2313         atomic_set(&newsk->sk_omem_alloc, 0);
2314         sk_init_common(newsk);
2315
2316         newsk->sk_dst_cache     = NULL;
2317         newsk->sk_dst_pending_confirm = 0;
2318         newsk->sk_wmem_queued   = 0;
2319         newsk->sk_forward_alloc = 0;
2320         newsk->sk_reserved_mem  = 0;
2321         atomic_set(&newsk->sk_drops, 0);
2322         newsk->sk_send_head     = NULL;
2323         newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2324         atomic_set(&newsk->sk_zckey, 0);
2325
2326         sock_reset_flag(newsk, SOCK_DONE);
2327
2328         /* sk->sk_memcg will be populated at accept() time */
2329         newsk->sk_memcg = NULL;
2330
2331         cgroup_sk_clone(&newsk->sk_cgrp_data);
2332
2333         rcu_read_lock();
2334         filter = rcu_dereference(sk->sk_filter);
2335         if (filter != NULL)
2336                 /* though it's an empty new sock, the charging may fail
2337                  * if sysctl_optmem_max was changed between creation of
2338                  * original socket and cloning
2339                  */
2340                 is_charged = sk_filter_charge(newsk, filter);
2341         RCU_INIT_POINTER(newsk->sk_filter, filter);
2342         rcu_read_unlock();
2343
2344         if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2345                 /* We need to make sure that we don't uncharge the new
2346                  * socket if we couldn't charge it in the first place
2347                  * as otherwise we uncharge the parent's filter.
2348                  */
2349                 if (!is_charged)
2350                         RCU_INIT_POINTER(newsk->sk_filter, NULL);
2351                 sk_free_unlock_clone(newsk);
2352                 newsk = NULL;
2353                 goto out;
2354         }
2355         RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2356
2357         if (bpf_sk_storage_clone(sk, newsk)) {
2358                 sk_free_unlock_clone(newsk);
2359                 newsk = NULL;
2360                 goto out;
2361         }
2362
2363         /* Clear sk_user_data if parent had the pointer tagged
2364          * as not suitable for copying when cloning.
2365          */
2366         if (sk_user_data_is_nocopy(newsk))
2367                 newsk->sk_user_data = NULL;
2368
2369         newsk->sk_err      = 0;
2370         newsk->sk_err_soft = 0;
2371         newsk->sk_priority = 0;
2372         newsk->sk_incoming_cpu = raw_smp_processor_id();
2373
2374         /* Before updating sk_refcnt, we must commit prior changes to memory
2375          * (Documentation/RCU/rculist_nulls.rst for details)
2376          */
2377         smp_wmb();
2378         refcount_set(&newsk->sk_refcnt, 2);
2379
2380         sk_set_socket(newsk, NULL);
2381         sk_tx_queue_clear(newsk);
2382         RCU_INIT_POINTER(newsk->sk_wq, NULL);
2383
2384         if (newsk->sk_prot->sockets_allocated)
2385                 sk_sockets_allocated_inc(newsk);
2386
2387         if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2388                 net_enable_timestamp();
2389 out:
2390         return newsk;
2391 }
2392 EXPORT_SYMBOL_GPL(sk_clone_lock);
2393
2394 void sk_free_unlock_clone(struct sock *sk)
2395 {
2396         /* It is still raw copy of parent, so invalidate
2397          * destructor and make plain sk_free() */
2398         sk->sk_destruct = NULL;
2399         bh_unlock_sock(sk);
2400         sk_free(sk);
2401 }
2402 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2403
2404 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2405 {
2406         bool is_ipv6 = false;
2407         u32 max_size;
2408
2409 #if IS_ENABLED(CONFIG_IPV6)
2410         is_ipv6 = (sk->sk_family == AF_INET6 &&
2411                    !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2412 #endif
2413         /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2414         max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2415                         READ_ONCE(dst->dev->gso_ipv4_max_size);
2416         if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2417                 max_size = GSO_LEGACY_MAX_SIZE;
2418
2419         return max_size - (MAX_TCP_HEADER + 1);
2420 }
2421
2422 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2423 {
2424         u32 max_segs = 1;
2425
2426         sk->sk_route_caps = dst->dev->features;
2427         if (sk_is_tcp(sk))
2428                 sk->sk_route_caps |= NETIF_F_GSO;
2429         if (sk->sk_route_caps & NETIF_F_GSO)
2430                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2431         if (unlikely(sk->sk_gso_disabled))
2432                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2433         if (sk_can_gso(sk)) {
2434                 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2435                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2436                 } else {
2437                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2438                         sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2439                         /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2440                         max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2441                 }
2442         }
2443         sk->sk_gso_max_segs = max_segs;
2444         sk_dst_set(sk, dst);
2445 }
2446 EXPORT_SYMBOL_GPL(sk_setup_caps);
2447
2448 /*
2449  *      Simple resource managers for sockets.
2450  */
2451
2452
2453 /*
2454  * Write buffer destructor automatically called from kfree_skb.
2455  */
2456 void sock_wfree(struct sk_buff *skb)
2457 {
2458         struct sock *sk = skb->sk;
2459         unsigned int len = skb->truesize;
2460         bool free;
2461
2462         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2463                 if (sock_flag(sk, SOCK_RCU_FREE) &&
2464                     sk->sk_write_space == sock_def_write_space) {
2465                         rcu_read_lock();
2466                         free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2467                         sock_def_write_space_wfree(sk);
2468                         rcu_read_unlock();
2469                         if (unlikely(free))
2470                                 __sk_free(sk);
2471                         return;
2472                 }
2473
2474                 /*
2475                  * Keep a reference on sk_wmem_alloc, this will be released
2476                  * after sk_write_space() call
2477                  */
2478                 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2479                 sk->sk_write_space(sk);
2480                 len = 1;
2481         }
2482         /*
2483          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2484          * could not do because of in-flight packets
2485          */
2486         if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2487                 __sk_free(sk);
2488 }
2489 EXPORT_SYMBOL(sock_wfree);
2490
2491 /* This variant of sock_wfree() is used by TCP,
2492  * since it sets SOCK_USE_WRITE_QUEUE.
2493  */
2494 void __sock_wfree(struct sk_buff *skb)
2495 {
2496         struct sock *sk = skb->sk;
2497
2498         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2499                 __sk_free(sk);
2500 }
2501
2502 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2503 {
2504         skb_orphan(skb);
2505         skb->sk = sk;
2506 #ifdef CONFIG_INET
2507         if (unlikely(!sk_fullsock(sk))) {
2508                 skb->destructor = sock_edemux;
2509                 sock_hold(sk);
2510                 return;
2511         }
2512 #endif
2513         skb->destructor = sock_wfree;
2514         skb_set_hash_from_sk(skb, sk);
2515         /*
2516          * We used to take a refcount on sk, but following operation
2517          * is enough to guarantee sk_free() wont free this sock until
2518          * all in-flight packets are completed
2519          */
2520         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2521 }
2522 EXPORT_SYMBOL(skb_set_owner_w);
2523
2524 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2525 {
2526 #ifdef CONFIG_TLS_DEVICE
2527         /* Drivers depend on in-order delivery for crypto offload,
2528          * partial orphan breaks out-of-order-OK logic.
2529          */
2530         if (skb->decrypted)
2531                 return false;
2532 #endif
2533         return (skb->destructor == sock_wfree ||
2534                 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2535 }
2536
2537 /* This helper is used by netem, as it can hold packets in its
2538  * delay queue. We want to allow the owner socket to send more
2539  * packets, as if they were already TX completed by a typical driver.
2540  * But we also want to keep skb->sk set because some packet schedulers
2541  * rely on it (sch_fq for example).
2542  */
2543 void skb_orphan_partial(struct sk_buff *skb)
2544 {
2545         if (skb_is_tcp_pure_ack(skb))
2546                 return;
2547
2548         if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2549                 return;
2550
2551         skb_orphan(skb);
2552 }
2553 EXPORT_SYMBOL(skb_orphan_partial);
2554
2555 /*
2556  * Read buffer destructor automatically called from kfree_skb.
2557  */
2558 void sock_rfree(struct sk_buff *skb)
2559 {
2560         struct sock *sk = skb->sk;
2561         unsigned int len = skb->truesize;
2562
2563         atomic_sub(len, &sk->sk_rmem_alloc);
2564         sk_mem_uncharge(sk, len);
2565 }
2566 EXPORT_SYMBOL(sock_rfree);
2567
2568 /*
2569  * Buffer destructor for skbs that are not used directly in read or write
2570  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2571  */
2572 void sock_efree(struct sk_buff *skb)
2573 {
2574         sock_put(skb->sk);
2575 }
2576 EXPORT_SYMBOL(sock_efree);
2577
/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets. The socket is only put when
 * sk_is_refcounted() says a reference is held.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (!sk_is_refcounted(sk))
		return;

	sock_gen_put(sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2589
2590 kuid_t sock_i_uid(struct sock *sk)
2591 {
2592         kuid_t uid;
2593
2594         read_lock_bh(&sk->sk_callback_lock);
2595         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2596         read_unlock_bh(&sk->sk_callback_lock);
2597         return uid;
2598 }
2599 EXPORT_SYMBOL(sock_i_uid);
2600
2601 unsigned long __sock_i_ino(struct sock *sk)
2602 {
2603         unsigned long ino;
2604
2605         read_lock(&sk->sk_callback_lock);
2606         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2607         read_unlock(&sk->sk_callback_lock);
2608         return ino;
2609 }
2610 EXPORT_SYMBOL(__sock_i_ino);
2611
/* BH-safe wrapper around __sock_i_ino(): bottom halves are disabled for
 * the duration of the lookup. Returns the socket inode number, or 0.
 */
unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long inode_nr;

	local_bh_disable();
	inode_nr = __sock_i_ino(sk);
	local_bh_enable();

	return inode_nr;
}
EXPORT_SYMBOL(sock_i_ino);
2622
2623 /*
2624  * Allocate a skb from the socket's send buffer.
2625  */
2626 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2627                              gfp_t priority)
2628 {
2629         if (force ||
2630             refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2631                 struct sk_buff *skb = alloc_skb(size, priority);
2632
2633                 if (skb) {
2634                         skb_set_owner_w(skb, sk);
2635                         return skb;
2636                 }
2637         }
2638         return NULL;
2639 }
2640 EXPORT_SYMBOL(sock_wmalloc);
2641
2642 static void sock_ofree(struct sk_buff *skb)
2643 {
2644         struct sock *sk = skb->sk;
2645
2646         atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2647 }
2648
2649 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2650                              gfp_t priority)
2651 {
2652         struct sk_buff *skb;
2653
2654         /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2655         if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2656             READ_ONCE(sysctl_optmem_max))
2657                 return NULL;
2658
2659         skb = alloc_skb(size, priority);
2660         if (!skb)
2661                 return NULL;
2662
2663         atomic_add(skb->truesize, &sk->sk_omem_alloc);
2664         skb->sk = sk;
2665         skb->destructor = sock_ofree;
2666         return skb;
2667 }
2668
2669 /*
2670  * Allocate a memory block from the socket's option memory buffer.
2671  */
2672 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2673 {
2674         int optmem_max = READ_ONCE(sysctl_optmem_max);
2675
2676         if ((unsigned int)size <= optmem_max &&
2677             atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2678                 void *mem;
2679                 /* First do the add, to avoid the race if kmalloc
2680                  * might sleep.
2681                  */
2682                 atomic_add(size, &sk->sk_omem_alloc);
2683                 mem = kmalloc(size, priority);
2684                 if (mem)
2685                         return mem;
2686                 atomic_sub(size, &sk->sk_omem_alloc);
2687         }
2688         return NULL;
2689 }
2690 EXPORT_SYMBOL(sock_kmalloc);
2691
2692 /* Free an option memory block. Note, we actually want the inline
2693  * here as this allows gcc to detect the nullify and fold away the
2694  * condition entirely.
2695  */
2696 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2697                                   const bool nullify)
2698 {
2699         if (WARN_ON_ONCE(!mem))
2700                 return;
2701         if (nullify)
2702                 kfree_sensitive(mem);
2703         else
2704                 kfree(mem);
2705         atomic_sub(size, &sk->sk_omem_alloc);
2706 }
2707
2708 void sock_kfree_s(struct sock *sk, void *mem, int size)
2709 {
2710         __sock_kfree_s(sk, mem, size, false);
2711 }
2712 EXPORT_SYMBOL(sock_kfree_s);
2713
2714 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2715 {
2716         __sock_kfree_s(sk, mem, size, true);
2717 }
2718 EXPORT_SYMBOL(sock_kzfree_s);
2719
2720 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2721    I think, these locks should be removed for datagram sockets.
2722  */
2723 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2724 {
2725         DEFINE_WAIT(wait);
2726
2727         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2728         for (;;) {
2729                 if (!timeo)
2730                         break;
2731                 if (signal_pending(current))
2732                         break;
2733                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2734                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2735                 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2736                         break;
2737                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2738                         break;
2739                 if (sk->sk_err)
2740                         break;
2741                 timeo = schedule_timeout(timeo);
2742         }
2743         finish_wait(sk_sleep(sk), &wait);
2744         return timeo;
2745 }
2746
2747
2748 /*
2749  *      Generic send/receive buffer handlers
2750  */
2751
2752 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2753                                      unsigned long data_len, int noblock,
2754                                      int *errcode, int max_page_order)
2755 {
2756         struct sk_buff *skb;
2757         long timeo;
2758         int err;
2759
2760         timeo = sock_sndtimeo(sk, noblock);
2761         for (;;) {
2762                 err = sock_error(sk);
2763                 if (err != 0)
2764                         goto failure;
2765
2766                 err = -EPIPE;
2767                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2768                         goto failure;
2769
2770                 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2771                         break;
2772
2773                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2774                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2775                 err = -EAGAIN;
2776                 if (!timeo)
2777                         goto failure;
2778                 if (signal_pending(current))
2779                         goto interrupted;
2780                 timeo = sock_wait_for_wmem(sk, timeo);
2781         }
2782         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2783                                    errcode, sk->sk_allocation);
2784         if (skb)
2785                 skb_set_owner_w(skb, sk);
2786         return skb;
2787
2788 interrupted:
2789         err = sock_intr_errno(timeo);
2790 failure:
2791         *errcode = err;
2792         return NULL;
2793 }
2794 EXPORT_SYMBOL(sock_alloc_send_pskb);
2795
2796 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2797                      struct sockcm_cookie *sockc)
2798 {
2799         u32 tsflags;
2800
2801         switch (cmsg->cmsg_type) {
2802         case SO_MARK:
2803                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2804                     !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2805                         return -EPERM;
2806                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2807                         return -EINVAL;
2808                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2809                 break;
2810         case SO_TIMESTAMPING_OLD:
2811                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2812                         return -EINVAL;
2813
2814                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2815                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2816                         return -EINVAL;
2817
2818                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2819                 sockc->tsflags |= tsflags;
2820                 break;
2821         case SCM_TXTIME:
2822                 if (!sock_flag(sk, SOCK_TXTIME))
2823                         return -EINVAL;
2824                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2825                         return -EINVAL;
2826                 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2827                 break;
2828         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2829         case SCM_RIGHTS:
2830         case SCM_CREDENTIALS:
2831                 break;
2832         default:
2833                 return -EINVAL;
2834         }
2835         return 0;
2836 }
2837 EXPORT_SYMBOL(__sock_cmsg_send);
2838
2839 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2840                    struct sockcm_cookie *sockc)
2841 {
2842         struct cmsghdr *cmsg;
2843         int ret;
2844
2845         for_each_cmsghdr(cmsg, msg) {
2846                 if (!CMSG_OK(msg, cmsg))
2847                         return -EINVAL;
2848                 if (cmsg->cmsg_level != SOL_SOCKET)
2849                         continue;
2850                 ret = __sock_cmsg_send(sk, cmsg, sockc);
2851                 if (ret)
2852                         return ret;
2853         }
2854         return 0;
2855 }
2856 EXPORT_SYMBOL(sock_cmsg_send);
2857
2858 static void sk_enter_memory_pressure(struct sock *sk)
2859 {
2860         if (!sk->sk_prot->enter_memory_pressure)
2861                 return;
2862
2863         sk->sk_prot->enter_memory_pressure(sk);
2864 }
2865
2866 static void sk_leave_memory_pressure(struct sock *sk)
2867 {
2868         if (sk->sk_prot->leave_memory_pressure) {
2869                 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2870                                      tcp_leave_memory_pressure, sk);
2871         } else {
2872                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2873
2874                 if (memory_pressure && READ_ONCE(*memory_pressure))
2875                         WRITE_ONCE(*memory_pressure, 0);
2876         }
2877 }
2878
2879 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2880
2881 /**
2882  * skb_page_frag_refill - check that a page_frag contains enough room
2883  * @sz: minimum size of the fragment we want to get
2884  * @pfrag: pointer to page_frag
2885  * @gfp: priority for memory allocation
2886  *
2887  * Note: While this allocator tries to use high order pages, there is
2888  * no guarantee that allocations succeed. Therefore, @sz MUST be
2889  * less or equal than PAGE_SIZE.
2890  */
2891 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2892 {
2893         if (pfrag->page) {
2894                 if (page_ref_count(pfrag->page) == 1) {
2895                         pfrag->offset = 0;
2896                         return true;
2897                 }
2898                 if (pfrag->offset + sz <= pfrag->size)
2899                         return true;
2900                 put_page(pfrag->page);
2901         }
2902
2903         pfrag->offset = 0;
2904         if (SKB_FRAG_PAGE_ORDER &&
2905             !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2906                 /* Avoid direct reclaim but allow kswapd to wake */
2907                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2908                                           __GFP_COMP | __GFP_NOWARN |
2909                                           __GFP_NORETRY,
2910                                           SKB_FRAG_PAGE_ORDER);
2911                 if (likely(pfrag->page)) {
2912                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2913                         return true;
2914                 }
2915         }
2916         pfrag->page = alloc_page(gfp);
2917         if (likely(pfrag->page)) {
2918                 pfrag->size = PAGE_SIZE;
2919                 return true;
2920         }
2921         return false;
2922 }
2923 EXPORT_SYMBOL(skb_page_frag_refill);
2924
2925 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2926 {
2927         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2928                 return true;
2929
2930         sk_enter_memory_pressure(sk);
2931         sk_stream_moderate_sndbuf(sk);
2932         return false;
2933 }
2934 EXPORT_SYMBOL(sk_page_frag_refill);
2935
2936 void __lock_sock(struct sock *sk)
2937         __releases(&sk->sk_lock.slock)
2938         __acquires(&sk->sk_lock.slock)
2939 {
2940         DEFINE_WAIT(wait);
2941
2942         for (;;) {
2943                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2944                                         TASK_UNINTERRUPTIBLE);
2945                 spin_unlock_bh(&sk->sk_lock.slock);
2946                 schedule();
2947                 spin_lock_bh(&sk->sk_lock.slock);
2948                 if (!sock_owned_by_user(sk))
2949                         break;
2950         }
2951         finish_wait(&sk->sk_lock.wq, &wait);
2952 }
2953
2954 void __release_sock(struct sock *sk)
2955         __releases(&sk->sk_lock.slock)
2956         __acquires(&sk->sk_lock.slock)
2957 {
2958         struct sk_buff *skb, *next;
2959
2960         while ((skb = sk->sk_backlog.head) != NULL) {
2961                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2962
2963                 spin_unlock_bh(&sk->sk_lock.slock);
2964
2965                 do {
2966                         next = skb->next;
2967                         prefetch(next);
2968                         DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
2969                         skb_mark_not_on_list(skb);
2970                         sk_backlog_rcv(sk, skb);
2971
2972                         cond_resched();
2973
2974                         skb = next;
2975                 } while (skb != NULL);
2976
2977                 spin_lock_bh(&sk->sk_lock.slock);
2978         }
2979
2980         /*
2981          * Doing the zeroing here guarantee we can not loop forever
2982          * while a wild producer attempts to flood us.
2983          */
2984         sk->sk_backlog.len = 0;
2985 }
2986
2987 void __sk_flush_backlog(struct sock *sk)
2988 {
2989         spin_lock_bh(&sk->sk_lock.slock);
2990         __release_sock(sk);
2991         spin_unlock_bh(&sk->sk_lock.slock);
2992 }
2993 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
2994
2995 /**
2996  * sk_wait_data - wait for data to arrive at sk_receive_queue
2997  * @sk:    sock to wait on
2998  * @timeo: for how long
2999  * @skb:   last skb seen on sk_receive_queue
3000  *
3001  * Now socket state including sk->sk_err is changed only under lock,
3002  * hence we may omit checks after joining wait queue.
3003  * We check receive queue before schedule() only as optimization;
3004  * it is very likely that release_sock() added new data.
3005  */
3006 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3007 {
3008         DEFINE_WAIT_FUNC(wait, woken_wake_function);
3009         int rc;
3010
3011         add_wait_queue(sk_sleep(sk), &wait);
3012         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3013         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3014         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3015         remove_wait_queue(sk_sleep(sk), &wait);
3016         return rc;
3017 }
3018 EXPORT_SYMBOL(sk_wait_data);
3019
3020 /**
3021  *      __sk_mem_raise_allocated - increase memory_allocated
3022  *      @sk: socket
3023  *      @size: memory size to allocate
3024  *      @amt: pages to allocate
3025  *      @kind: allocation type
3026  *
3027  *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
3028  */
3029 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3030 {
3031         bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
3032         struct proto *prot = sk->sk_prot;
3033         bool charged = true;
3034         long allocated;
3035
3036         sk_memory_allocated_add(sk, amt);
3037         allocated = sk_memory_allocated(sk);
3038         if (memcg_charge &&
3039             !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
3040                                                 gfp_memcg_charge())))
3041                 goto suppress_allocation;
3042
3043         /* Under limit. */
3044         if (allocated <= sk_prot_mem_limits(sk, 0)) {
3045                 sk_leave_memory_pressure(sk);
3046                 return 1;
3047         }
3048
3049         /* Under pressure. */
3050         if (allocated > sk_prot_mem_limits(sk, 1))
3051                 sk_enter_memory_pressure(sk);
3052
3053         /* Over hard limit. */
3054         if (allocated > sk_prot_mem_limits(sk, 2))
3055                 goto suppress_allocation;
3056
3057         /* guarantee minimum buffer size under pressure */
3058         if (kind == SK_MEM_RECV) {
3059                 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3060                         return 1;
3061
3062         } else { /* SK_MEM_SEND */
3063                 int wmem0 = sk_get_wmem0(sk, prot);
3064
3065                 if (sk->sk_type == SOCK_STREAM) {
3066                         if (sk->sk_wmem_queued < wmem0)
3067                                 return 1;
3068                 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3069                                 return 1;
3070                 }
3071         }
3072
3073         if (sk_has_memory_pressure(sk)) {
3074                 u64 alloc;
3075
3076                 if (!sk_under_memory_pressure(sk))
3077                         return 1;
3078                 alloc = sk_sockets_allocated_read_positive(sk);
3079                 if (sk_prot_mem_limits(sk, 2) > alloc *
3080                     sk_mem_pages(sk->sk_wmem_queued +
3081                                  atomic_read(&sk->sk_rmem_alloc) +
3082                                  sk->sk_forward_alloc))
3083                         return 1;
3084         }
3085
3086 suppress_allocation:
3087
3088         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3089                 sk_stream_moderate_sndbuf(sk);
3090
3091                 /* Fail only if socket is _under_ its sndbuf.
3092                  * In this case we cannot block, so that we have to fail.
3093                  */
3094                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3095                         /* Force charge with __GFP_NOFAIL */
3096                         if (memcg_charge && !charged) {
3097                                 mem_cgroup_charge_skmem(sk->sk_memcg, amt,
3098                                         gfp_memcg_charge() | __GFP_NOFAIL);
3099                         }
3100                         return 1;
3101                 }
3102         }
3103
3104         if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3105                 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3106
3107         sk_memory_allocated_sub(sk, amt);
3108
3109         if (memcg_charge && charged)
3110                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
3111
3112         return 0;
3113 }
3114
3115 /**
3116  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3117  *      @sk: socket
3118  *      @size: memory size to allocate
3119  *      @kind: allocation type
3120  *
3121  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3122  *      rmem allocation. This function assumes that protocols which have
3123  *      memory_pressure use sk_wmem_queued as write buffer accounting.
3124  */
3125 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3126 {
3127         int ret, amt = sk_mem_pages(size);
3128
3129         sk->sk_forward_alloc += amt << PAGE_SHIFT;
3130         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3131         if (!ret)
3132                 sk->sk_forward_alloc -= amt << PAGE_SHIFT;
3133         return ret;
3134 }
3135 EXPORT_SYMBOL(__sk_mem_schedule);
3136
3137 /**
3138  *      __sk_mem_reduce_allocated - reclaim memory_allocated
3139  *      @sk: socket
3140  *      @amount: number of quanta
3141  *
3142  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3143  */
3144 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3145 {
3146         sk_memory_allocated_sub(sk, amount);
3147
3148         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3149                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3150
3151         if (sk_under_memory_pressure(sk) &&
3152             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3153                 sk_leave_memory_pressure(sk);
3154 }
3155
3156 /**
3157  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3158  *      @sk: socket
3159  *      @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3160  */
3161 void __sk_mem_reclaim(struct sock *sk, int amount)
3162 {
3163         amount >>= PAGE_SHIFT;
3164         sk->sk_forward_alloc -= amount << PAGE_SHIFT;
3165         __sk_mem_reduce_allocated(sk, amount);
3166 }
3167 EXPORT_SYMBOL(__sk_mem_reclaim);
3168
3169 int sk_set_peek_off(struct sock *sk, int val)
3170 {
3171         sk->sk_peek_off = val;
3172         return 0;
3173 }
3174 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3175
3176 /*
3177  * Set of default routines for initialising struct proto_ops when
3178  * the protocol does not support a particular function. In certain
3179  * cases where it makes no sense for a protocol to have a "do nothing"
3180  * function, some default processing is provided.
3181  */
3182
3183 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3184 {
3185         return -EOPNOTSUPP;
3186 }
3187 EXPORT_SYMBOL(sock_no_bind);
3188
3189 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3190                     int len, int flags)
3191 {
3192         return -EOPNOTSUPP;
3193 }
3194 EXPORT_SYMBOL(sock_no_connect);
3195
3196 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3197 {
3198         return -EOPNOTSUPP;
3199 }
3200 EXPORT_SYMBOL(sock_no_socketpair);
3201
3202 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
3203                    bool kern)
3204 {
3205         return -EOPNOTSUPP;
3206 }
3207 EXPORT_SYMBOL(sock_no_accept);
3208
3209 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3210                     int peer)
3211 {
3212         return -EOPNOTSUPP;
3213 }
3214 EXPORT_SYMBOL(sock_no_getname);
3215
3216 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3217 {
3218         return -EOPNOTSUPP;
3219 }
3220 EXPORT_SYMBOL(sock_no_ioctl);
3221
3222 int sock_no_listen(struct socket *sock, int backlog)
3223 {
3224         return -EOPNOTSUPP;
3225 }
3226 EXPORT_SYMBOL(sock_no_listen);
3227
3228 int sock_no_shutdown(struct socket *sock, int how)
3229 {
3230         return -EOPNOTSUPP;
3231 }
3232 EXPORT_SYMBOL(sock_no_shutdown);
3233
3234 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3235 {
3236         return -EOPNOTSUPP;
3237 }
3238 EXPORT_SYMBOL(sock_no_sendmsg);
3239
3240 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3241 {
3242         return -EOPNOTSUPP;
3243 }
3244 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3245
3246 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3247                     int flags)
3248 {
3249         return -EOPNOTSUPP;
3250 }
3251 EXPORT_SYMBOL(sock_no_recvmsg);
3252
3253 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3254 {
3255         /* Mirror missing mmap method error code */
3256         return -ENODEV;
3257 }
3258 EXPORT_SYMBOL(sock_no_mmap);
3259
3260 /*
3261  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3262  * various sock-based usage counts.
3263  */
3264 void __receive_sock(struct file *file)
3265 {
3266         struct socket *sock;
3267
3268         sock = sock_from_file(file);
3269         if (sock) {
3270                 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3271                 sock_update_classid(&sock->sk->sk_cgrp_data);
3272         }
3273 }
3274
3275 /*
3276  *      Default Socket Callbacks
3277  */
3278
3279 static void sock_def_wakeup(struct sock *sk)
3280 {
3281         struct socket_wq *wq;
3282
3283         rcu_read_lock();
3284         wq = rcu_dereference(sk->sk_wq);
3285         if (skwq_has_sleeper(wq))
3286                 wake_up_interruptible_all(&wq->wait);
3287         rcu_read_unlock();
3288 }
3289
3290 static void sock_def_error_report(struct sock *sk)
3291 {
3292         struct socket_wq *wq;
3293
3294         rcu_read_lock();
3295         wq = rcu_dereference(sk->sk_wq);
3296         if (skwq_has_sleeper(wq))
3297                 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3298         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3299         rcu_read_unlock();
3300 }
3301
3302 void sock_def_readable(struct sock *sk)
3303 {
3304         struct socket_wq *wq;
3305
3306         trace_sk_data_ready(sk);
3307
3308         rcu_read_lock();
3309         wq = rcu_dereference(sk->sk_wq);
3310         if (skwq_has_sleeper(wq))
3311                 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3312                                                 EPOLLRDNORM | EPOLLRDBAND);
3313         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3314         rcu_read_unlock();
3315 }
3316
3317 static void sock_def_write_space(struct sock *sk)
3318 {
3319         struct socket_wq *wq;
3320
3321         rcu_read_lock();
3322
3323         /* Do not wake up a writer until he can make "significant"
3324          * progress.  --DaveM
3325          */
3326         if (sock_writeable(sk)) {
3327                 wq = rcu_dereference(sk->sk_wq);
3328                 if (skwq_has_sleeper(wq))
3329                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3330                                                 EPOLLWRNORM | EPOLLWRBAND);
3331
3332                 /* Should agree with poll, otherwise some programs break */
3333                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3334         }
3335
3336         rcu_read_unlock();
3337 }
3338
3339 /* An optimised version of sock_def_write_space(), should only be called
3340  * for SOCK_RCU_FREE sockets under RCU read section and after putting
3341  * ->sk_wmem_alloc.
3342  */
3343 static void sock_def_write_space_wfree(struct sock *sk)
3344 {
3345         /* Do not wake up a writer until he can make "significant"
3346          * progress.  --DaveM
3347          */
3348         if (sock_writeable(sk)) {
3349                 struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3350
3351                 /* rely on refcount_sub from sock_wfree() */
3352                 smp_mb__after_atomic();
3353                 if (wq && waitqueue_active(&wq->wait))
3354                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3355                                                 EPOLLWRNORM | EPOLLWRBAND);
3356
3357                 /* Should agree with poll, otherwise some programs break */
3358                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3359         }
3360 }
3361
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
3365
3366 void sk_send_sigurg(struct sock *sk)
3367 {
3368         if (sk->sk_socket && sk->sk_socket->file)
3369                 if (send_sigurg(&sk->sk_socket->file->f_owner))
3370                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3371 }
3372 EXPORT_SYMBOL(sk_send_sigurg);
3373
/* (Re)arm @timer to fire at @expires. A reference on @sk is taken when
 * mod_timer() returns 0, i.e. (per the timer API) when the timer was
 * not already pending.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	int was_pending = mod_timer(timer, expires);

	if (!was_pending)
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
3381
/* Cancel @timer with del_timer(). When that returns non-zero, one
 * reference on @sk is dropped with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	int was_pending = del_timer(timer);

	if (was_pending)
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
3388
/* Same as sk_stop_timer(), except that the timer is cancelled with
 * del_timer_sync().
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	int was_pending = del_timer_sync(timer);

	if (was_pending)
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer_sync);
3395
3396 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3397 {
3398         sk_init_common(sk);
3399         sk->sk_send_head        =       NULL;
3400
3401         timer_setup(&sk->sk_timer, NULL, 0);
3402
3403         sk->sk_allocation       =       GFP_KERNEL;
3404         sk->sk_rcvbuf           =       READ_ONCE(sysctl_rmem_default);
3405         sk->sk_sndbuf           =       READ_ONCE(sysctl_wmem_default);
3406         sk->sk_state            =       TCP_CLOSE;
3407         sk->sk_use_task_frag    =       true;
3408         sk_set_socket(sk, sock);
3409
3410         sock_set_flag(sk, SOCK_ZAPPED);
3411
3412         if (sock) {
3413                 sk->sk_type     =       sock->type;
3414                 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3415                 sock->sk        =       sk;
3416         } else {
3417                 RCU_INIT_POINTER(sk->sk_wq, NULL);
3418         }
3419         sk->sk_uid      =       uid;
3420
3421         rwlock_init(&sk->sk_callback_lock);
3422         if (sk->sk_kern_sock)
3423                 lockdep_set_class_and_name(
3424                         &sk->sk_callback_lock,
3425                         af_kern_callback_keys + sk->sk_family,
3426                         af_family_kern_clock_key_strings[sk->sk_family]);
3427         else
3428                 lockdep_set_class_and_name(
3429                         &sk->sk_callback_lock,
3430                         af_callback_keys + sk->sk_family,
3431                         af_family_clock_key_strings[sk->sk_family]);
3432
3433         sk->sk_state_change     =       sock_def_wakeup;
3434         sk->sk_data_ready       =       sock_def_readable;
3435         sk->sk_write_space      =       sock_def_write_space;
3436         sk->sk_error_report     =       sock_def_error_report;
3437         sk->sk_destruct         =       sock_def_destruct;
3438
3439         sk->sk_frag.page        =       NULL;
3440         sk->sk_frag.offset      =       0;
3441         sk->sk_peek_off         =       -1;
3442
3443         sk->sk_peer_pid         =       NULL;
3444         sk->sk_peer_cred        =       NULL;
3445         spin_lock_init(&sk->sk_peer_lock);
3446
3447         sk->sk_write_pending    =       0;
3448         sk->sk_rcvlowat         =       1;
3449         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
3450         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
3451
3452         sk->sk_stamp = SK_DEFAULT_STAMP;
3453 #if BITS_PER_LONG==32
3454         seqlock_init(&sk->sk_stamp_seq);
3455 #endif
3456         atomic_set(&sk->sk_zckey, 0);
3457
3458 #ifdef CONFIG_NET_RX_BUSY_POLL
3459         sk->sk_napi_id          =       0;
3460         sk->sk_ll_usec          =       READ_ONCE(sysctl_net_busy_read);
3461 #endif
3462
3463         sk->sk_max_pacing_rate = ~0UL;
3464         sk->sk_pacing_rate = ~0UL;
3465         WRITE_ONCE(sk->sk_pacing_shift, 10);
3466         sk->sk_incoming_cpu = -1;
3467
3468         sk_rx_queue_clear(sk);
3469         /*
3470          * Before updating sk_refcnt, we must commit prior changes to memory
3471          * (Documentation/RCU/rculist_nulls.rst for details)
3472          */
3473         smp_wmb();
3474         refcount_set(&sk->sk_refcnt, 1);
3475         atomic_set(&sk->sk_drops, 0);
3476 }
3477 EXPORT_SYMBOL(sock_init_data_uid);
3478
3479 void sock_init_data(struct socket *sock, struct sock *sk)
3480 {
3481         kuid_t uid = sock ?
3482                 SOCK_INODE(sock)->i_uid :
3483                 make_kuid(sock_net(sk)->user_ns, 0);
3484
3485         sock_init_data_uid(sock, sk, uid);
3486 }
3487 EXPORT_SYMBOL(sock_init_data);
3488
3489 void lock_sock_nested(struct sock *sk, int subclass)
3490 {
3491         /* The sk_lock has mutex_lock() semantics here. */
3492         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3493
3494         might_sleep();
3495         spin_lock_bh(&sk->sk_lock.slock);
3496         if (sock_owned_by_user_nocheck(sk))
3497                 __lock_sock(sk);
3498         sk->sk_lock.owned = 1;
3499         spin_unlock_bh(&sk->sk_lock.slock);
3500 }
3501 EXPORT_SYMBOL(lock_sock_nested);
3502
3503 void release_sock(struct sock *sk)
3504 {
3505         spin_lock_bh(&sk->sk_lock.slock);
3506         if (sk->sk_backlog.tail)
3507                 __release_sock(sk);
3508
3509         /* Warning : release_cb() might need to release sk ownership,
3510          * ie call sock_release_ownership(sk) before us.
3511          */
3512         if (sk->sk_prot->release_cb)
3513                 sk->sk_prot->release_cb(sk);
3514
3515         sock_release_ownership(sk);
3516         if (waitqueue_active(&sk->sk_lock.wq))
3517                 wake_up(&sk->sk_lock.wq);
3518         spin_unlock_bh(&sk->sk_lock.slock);
3519 }
3520 EXPORT_SYMBOL(release_sock);
3521
3522 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3523 {
3524         might_sleep();
3525         spin_lock_bh(&sk->sk_lock.slock);
3526
3527         if (!sock_owned_by_user_nocheck(sk)) {
3528                 /*
3529                  * Fast path return with bottom halves disabled and
3530                  * sock::sk_lock.slock held.
3531                  *
3532                  * The 'mutex' is not contended and holding
3533                  * sock::sk_lock.slock prevents all other lockers to
3534                  * proceed so the corresponding unlock_sock_fast() can
3535                  * avoid the slow path of release_sock() completely and
3536                  * just release slock.
3537                  *
3538                  * From a semantical POV this is equivalent to 'acquiring'
3539                  * the 'mutex', hence the corresponding lockdep
3540                  * mutex_release() has to happen in the fast path of
3541                  * unlock_sock_fast().
3542                  */
3543                 return false;
3544         }
3545
3546         __lock_sock(sk);
3547         sk->sk_lock.owned = 1;
3548         __acquire(&sk->sk_lock.slock);
3549         spin_unlock_bh(&sk->sk_lock.slock);
3550         return true;
3551 }
3552 EXPORT_SYMBOL(__lock_sock_fast);
3553
3554 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3555                    bool timeval, bool time32)
3556 {
3557         struct sock *sk = sock->sk;
3558         struct timespec64 ts;
3559
3560         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3561         ts = ktime_to_timespec64(sock_read_timestamp(sk));
3562         if (ts.tv_sec == -1)
3563                 return -ENOENT;
3564         if (ts.tv_sec == 0) {
3565                 ktime_t kt = ktime_get_real();
3566                 sock_write_timestamp(sk, kt);
3567                 ts = ktime_to_timespec64(kt);
3568         }
3569
3570         if (timeval)
3571                 ts.tv_nsec /= 1000;
3572
3573 #ifdef CONFIG_COMPAT_32BIT_TIME
3574         if (time32)
3575                 return put_old_timespec32(&ts, userstamp);
3576 #endif
3577 #ifdef CONFIG_SPARC64
3578         /* beware of padding in sparc64 timeval */
3579         if (timeval && !in_compat_syscall()) {
3580                 struct __kernel_old_timeval __user tv = {
3581                         .tv_sec = ts.tv_sec,
3582                         .tv_usec = ts.tv_nsec,
3583                 };
3584                 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3585                         return -EFAULT;
3586                 return 0;
3587         }
3588 #endif
3589         return put_timespec64(&ts, userstamp);
3590 }
3591 EXPORT_SYMBOL(sock_gettstamp);
3592
3593 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3594 {
3595         if (!sock_flag(sk, flag)) {
3596                 unsigned long previous_flags = sk->sk_flags;
3597
3598                 sock_set_flag(sk, flag);
3599                 /*
3600                  * we just set one of the two flags which require net
3601                  * time stamping, but time stamping might have been on
3602                  * already because of the other one
3603                  */
3604                 if (sock_needs_netstamp(sk) &&
3605                     !(previous_flags & SK_FLAGS_TIMESTAMP))
3606                         net_enable_timestamp();
3607         }
3608 }
3609
3610 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3611                        int level, int type)
3612 {
3613         struct sock_exterr_skb *serr;
3614         struct sk_buff *skb;
3615         int copied, err;
3616
3617         err = -EAGAIN;
3618         skb = sock_dequeue_err_skb(sk);
3619         if (skb == NULL)
3620                 goto out;
3621
3622         copied = skb->len;
3623         if (copied > len) {
3624                 msg->msg_flags |= MSG_TRUNC;
3625                 copied = len;
3626         }
3627         err = skb_copy_datagram_msg(skb, 0, msg, copied);
3628         if (err)
3629                 goto out_free_skb;
3630
3631         sock_recv_timestamp(msg, sk, skb);
3632
3633         serr = SKB_EXT_ERR(skb);
3634         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3635
3636         msg->msg_flags |= MSG_ERRQUEUE;
3637         err = copied;
3638
3639 out_free_skb:
3640         kfree_skb(skb);
3641 out:
3642         return err;
3643 }
3644 EXPORT_SYMBOL(sock_recv_errqueue);
3645
3646 /*
3647  *      Get a socket option on an socket.
3648  *
3649  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3650  *      asynchronous errors should be reported by getsockopt. We assume
3651  *      this means if you specify SO_ERROR (otherwise whats the point of it).
3652  */
3653 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3654                            char __user *optval, int __user *optlen)
3655 {
3656         struct sock *sk = sock->sk;
3657
3658         /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3659         return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3660 }
3661 EXPORT_SYMBOL(sock_common_getsockopt);
3662
3663 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3664                         int flags)
3665 {
3666         struct sock *sk = sock->sk;
3667         int addr_len = 0;
3668         int err;
3669
3670         err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3671         if (err >= 0)
3672                 msg->msg_namelen = addr_len;
3673         return err;
3674 }
3675 EXPORT_SYMBOL(sock_common_recvmsg);
3676
3677 /*
3678  *      Set socket options on an inet socket.
3679  */
3680 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3681                            sockptr_t optval, unsigned int optlen)
3682 {
3683         struct sock *sk = sock->sk;
3684
3685         /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3686         return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3687 }
3688 EXPORT_SYMBOL(sock_common_setsockopt);
3689
3690 void sk_common_release(struct sock *sk)
3691 {
3692         if (sk->sk_prot->destroy)
3693                 sk->sk_prot->destroy(sk);
3694
3695         /*
3696          * Observation: when sk_common_release is called, processes have
3697          * no access to socket. But net still has.
3698          * Step one, detach it from networking:
3699          *
3700          * A. Remove from hash tables.
3701          */
3702
3703         sk->sk_prot->unhash(sk);
3704
3705         /*
3706          * In this point socket cannot receive new packets, but it is possible
3707          * that some packets are in flight because some CPU runs receiver and
3708          * did hash table lookup before we unhashed socket. They will achieve
3709          * receive queue and will be purged by socket destructor.
3710          *
3711          * Also we still have packets pending on receive queue and probably,
3712          * our own packets waiting in device queues. sock_destroy will drain
3713          * receive queue, but transmitted packets will delay socket destruction
3714          * until the last reference will be released.
3715          */
3716
3717         sock_orphan(sk);
3718
3719         xfrm_sk_free_policy(sk);
3720
3721         sock_put(sk);
3722 }
3723 EXPORT_SYMBOL(sk_common_release);
3724
3725 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3726 {
3727         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3728
3729         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3730         mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3731         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3732         mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3733         mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3734         mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3735         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3736         mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3737         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3738 }
3739
3740 #ifdef CONFIG_PROC_FS
3741 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3742
3743 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3744 {
3745         int cpu, idx = prot->inuse_idx;
3746         int res = 0;
3747
3748         for_each_possible_cpu(cpu)
3749                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3750
3751         return res >= 0 ? res : 0;
3752 }
3753 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3754
3755 int sock_inuse_get(struct net *net)
3756 {
3757         int cpu, res = 0;
3758
3759         for_each_possible_cpu(cpu)
3760                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3761
3762         return res;
3763 }
3764
3765 EXPORT_SYMBOL_GPL(sock_inuse_get);
3766
3767 static int __net_init sock_inuse_init_net(struct net *net)
3768 {
3769         net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3770         if (net->core.prot_inuse == NULL)
3771                 return -ENOMEM;
3772         return 0;
3773 }
3774
3775 static void __net_exit sock_inuse_exit_net(struct net *net)
3776 {
3777         free_percpu(net->core.prot_inuse);
3778 }
3779
3780 static struct pernet_operations net_inuse_ops = {
3781         .init = sock_inuse_init_net,
3782         .exit = sock_inuse_exit_net,
3783 };
3784
3785 static __init int net_inuse_init(void)
3786 {
3787         if (register_pernet_subsys(&net_inuse_ops))
3788                 panic("Cannot initialize net inuse counters");
3789
3790         return 0;
3791 }
3792
3793 core_initcall(net_inuse_init);
3794
3795 static int assign_proto_idx(struct proto *prot)
3796 {
3797         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3798
3799         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3800                 pr_err("PROTO_INUSE_NR exhausted\n");
3801                 return -ENOSPC;
3802         }
3803
3804         set_bit(prot->inuse_idx, proto_inuse_idx);
3805         return 0;
3806 }
3807
3808 static void release_proto_idx(struct proto *prot)
3809 {
3810         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3811                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3812 }
3813 #else
/* !CONFIG_PROC_FS: no in-use index is tracked, assignment always succeeds. */
static inline int assign_proto_idx(struct proto *prot)
{
        return 0;
}

/* !CONFIG_PROC_FS: nothing was assigned, so nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3822
3823 #endif
3824
3825 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3826 {
3827         if (!twsk_prot)
3828                 return;
3829         kfree(twsk_prot->twsk_slab_name);
3830         twsk_prot->twsk_slab_name = NULL;
3831         kmem_cache_destroy(twsk_prot->twsk_slab);
3832         twsk_prot->twsk_slab = NULL;
3833 }
3834
3835 static int tw_prot_init(const struct proto *prot)
3836 {
3837         struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3838
3839         if (!twsk_prot)
3840                 return 0;
3841
3842         twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3843                                               prot->name);
3844         if (!twsk_prot->twsk_slab_name)
3845                 return -ENOMEM;
3846
3847         twsk_prot->twsk_slab =
3848                 kmem_cache_create(twsk_prot->twsk_slab_name,
3849                                   twsk_prot->twsk_obj_size, 0,
3850                                   SLAB_ACCOUNT | prot->slab_flags,
3851                                   NULL);
3852         if (!twsk_prot->twsk_slab) {
3853                 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3854                         prot->name);
3855                 return -ENOMEM;
3856         }
3857
3858         return 0;
3859 }
3860
3861 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3862 {
3863         if (!rsk_prot)
3864                 return;
3865         kfree(rsk_prot->slab_name);
3866         rsk_prot->slab_name = NULL;
3867         kmem_cache_destroy(rsk_prot->slab);
3868         rsk_prot->slab = NULL;
3869 }
3870
3871 static int req_prot_init(const struct proto *prot)
3872 {
3873         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3874
3875         if (!rsk_prot)
3876                 return 0;
3877
3878         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3879                                         prot->name);
3880         if (!rsk_prot->slab_name)
3881                 return -ENOMEM;
3882
3883         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3884                                            rsk_prot->obj_size, 0,
3885                                            SLAB_ACCOUNT | prot->slab_flags,
3886                                            NULL);
3887
3888         if (!rsk_prot->slab) {
3889                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3890                         prot->name);
3891                 return -ENOMEM;
3892         }
3893         return 0;
3894 }
3895
3896 int proto_register(struct proto *prot, int alloc_slab)
3897 {
3898         int ret = -ENOBUFS;
3899
3900         if (prot->memory_allocated && !prot->sysctl_mem) {
3901                 pr_err("%s: missing sysctl_mem\n", prot->name);
3902                 return -EINVAL;
3903         }
3904         if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3905                 pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3906                 return -EINVAL;
3907         }
3908         if (alloc_slab) {
3909                 prot->slab = kmem_cache_create_usercopy(prot->name,
3910                                         prot->obj_size, 0,
3911                                         SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3912                                         prot->slab_flags,
3913                                         prot->useroffset, prot->usersize,
3914                                         NULL);
3915
3916                 if (prot->slab == NULL) {
3917                         pr_crit("%s: Can't create sock SLAB cache!\n",
3918                                 prot->name);
3919                         goto out;
3920                 }
3921
3922                 if (req_prot_init(prot))
3923                         goto out_free_request_sock_slab;
3924
3925                 if (tw_prot_init(prot))
3926                         goto out_free_timewait_sock_slab;
3927         }
3928
3929         mutex_lock(&proto_list_mutex);
3930         ret = assign_proto_idx(prot);
3931         if (ret) {
3932                 mutex_unlock(&proto_list_mutex);
3933                 goto out_free_timewait_sock_slab;
3934         }
3935         list_add(&prot->node, &proto_list);
3936         mutex_unlock(&proto_list_mutex);
3937         return ret;
3938
3939 out_free_timewait_sock_slab:
3940         if (alloc_slab)
3941                 tw_prot_cleanup(prot->twsk_prot);
3942 out_free_request_sock_slab:
3943         if (alloc_slab) {
3944                 req_prot_cleanup(prot->rsk_prot);
3945
3946                 kmem_cache_destroy(prot->slab);
3947                 prot->slab = NULL;
3948         }
3949 out:
3950         return ret;
3951 }
3952 EXPORT_SYMBOL(proto_register);
3953
3954 void proto_unregister(struct proto *prot)
3955 {
3956         mutex_lock(&proto_list_mutex);
3957         release_proto_idx(prot);
3958         list_del(&prot->node);
3959         mutex_unlock(&proto_list_mutex);
3960
3961         kmem_cache_destroy(prot->slab);
3962         prot->slab = NULL;
3963
3964         req_prot_cleanup(prot->rsk_prot);
3965         tw_prot_cleanup(prot->twsk_prot);
3966 }
3967 EXPORT_SYMBOL(proto_unregister);
3968
3969 int sock_load_diag_module(int family, int protocol)
3970 {
3971         if (!protocol) {
3972                 if (!sock_is_registered(family))
3973                         return -ENOENT;
3974
3975                 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3976                                       NETLINK_SOCK_DIAG, family);
3977         }
3978
3979 #ifdef CONFIG_INET
3980         if (family == AF_INET &&
3981             protocol != IPPROTO_RAW &&
3982             protocol < MAX_INET_PROTOS &&
3983             !rcu_access_pointer(inet_protos[protocol]))
3984                 return -ENOENT;
3985 #endif
3986
3987         return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3988                               NETLINK_SOCK_DIAG, family, protocol);
3989 }
3990 EXPORT_SYMBOL(sock_load_diag_module);
3991
3992 #ifdef CONFIG_PROC_FS
3993 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3994         __acquires(proto_list_mutex)
3995 {
3996         mutex_lock(&proto_list_mutex);
3997         return seq_list_start_head(&proto_list, *pos);
3998 }
3999
4000 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4001 {
4002         return seq_list_next(v, &proto_list, pos);
4003 }
4004
4005 static void proto_seq_stop(struct seq_file *seq, void *v)
4006         __releases(proto_list_mutex)
4007 {
4008         mutex_unlock(&proto_list_mutex);
4009 }
4010
/* Map a method pointer to 'y' (set) or 'n' (NULL) for /proc output. */
static char proto_method_implemented(const void *method)
{
        if (method)
                return 'y';

        return 'n';
}
4015 static long sock_prot_memory_allocated(struct proto *proto)
4016 {
4017         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4018 }
4019
4020 static const char *sock_prot_memory_pressure(struct proto *proto)
4021 {
4022         return proto->memory_pressure != NULL ?
4023         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4024 }
4025
4026 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4027 {
4028
4029         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4030                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4031                    proto->name,
4032                    proto->obj_size,
4033                    sock_prot_inuse_get(seq_file_net(seq), proto),
4034                    sock_prot_memory_allocated(proto),
4035                    sock_prot_memory_pressure(proto),
4036                    proto->max_header,
4037                    proto->slab == NULL ? "no" : "yes",
4038                    module_name(proto->owner),
4039                    proto_method_implemented(proto->close),
4040                    proto_method_implemented(proto->connect),
4041                    proto_method_implemented(proto->disconnect),
4042                    proto_method_implemented(proto->accept),
4043                    proto_method_implemented(proto->ioctl),
4044                    proto_method_implemented(proto->init),
4045                    proto_method_implemented(proto->destroy),
4046                    proto_method_implemented(proto->shutdown),
4047                    proto_method_implemented(proto->setsockopt),
4048                    proto_method_implemented(proto->getsockopt),
4049                    proto_method_implemented(proto->sendmsg),
4050                    proto_method_implemented(proto->recvmsg),
4051                    proto_method_implemented(proto->bind),
4052                    proto_method_implemented(proto->backlog_rcv),
4053                    proto_method_implemented(proto->hash),
4054                    proto_method_implemented(proto->unhash),
4055                    proto_method_implemented(proto->get_port),
4056                    proto_method_implemented(proto->enter_memory_pressure));
4057 }
4058
4059 static int proto_seq_show(struct seq_file *seq, void *v)
4060 {
4061         if (v == &proto_list)
4062                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4063                            "protocol",
4064                            "size",
4065                            "sockets",
4066                            "memory",
4067                            "press",
4068                            "maxhdr",
4069                            "slab",
4070                            "module",
4071                            "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4072         else
4073                 proto_seq_printf(seq, list_entry(v, struct proto, node));
4074         return 0;
4075 }
4076
4077 static const struct seq_operations proto_seq_ops = {
4078         .start  = proto_seq_start,
4079         .next   = proto_seq_next,
4080         .stop   = proto_seq_stop,
4081         .show   = proto_seq_show,
4082 };
4083
4084 static __net_init int proto_init_net(struct net *net)
4085 {
4086         if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4087                         sizeof(struct seq_net_private)))
4088                 return -ENOMEM;
4089
4090         return 0;
4091 }
4092
4093 static __net_exit void proto_exit_net(struct net *net)
4094 {
4095         remove_proc_entry("protocols", net->proc_net);
4096 }
4097
4098
4099 static __net_initdata struct pernet_operations proto_net_ops = {
4100         .init = proto_init_net,
4101         .exit = proto_exit_net,
4102 };
4103
4104 static int __init proto_init(void)
4105 {
4106         return register_pernet_subsys(&proto_net_ops);
4107 }
4108
4109 subsys_initcall(proto_init);
4110
4111 #endif /* PROC_FS */
4112
4113 #ifdef CONFIG_NET_RX_BUSY_POLL
4114 bool sk_busy_loop_end(void *p, unsigned long start_time)
4115 {
4116         struct sock *sk = p;
4117
4118         return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
4119                sk_busy_loop_timeout(sk, start_time);
4120 }
4121 EXPORT_SYMBOL(sk_busy_loop_end);
4122 #endif /* CONFIG_NET_RX_BUSY_POLL */
4123
4124 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4125 {
4126         if (!sk->sk_prot->bind_add)
4127                 return -EOPNOTSUPP;
4128         return sk->sk_prot->bind_add(sk, addr, addr_len);
4129 }
4130 EXPORT_SYMBOL(sock_bind_add);
4131
4132 /* Copy 'size' bytes from userspace and return `size` back to userspace */
4133 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4134                      void __user *arg, void *karg, size_t size)
4135 {
4136         int ret;
4137
4138         if (copy_from_user(karg, arg, size))
4139                 return -EFAULT;
4140
4141         ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4142         if (ret)
4143                 return ret;
4144
4145         if (copy_to_user(arg, karg, size))
4146                 return -EFAULT;
4147
4148         return 0;
4149 }
4150 EXPORT_SYMBOL(sock_ioctl_inout);
4151
4152 /* This is the most common ioctl prep function, where the result (4 bytes) is
4153  * copied back to userspace if the ioctl() returns successfully. No input is
4154  * copied from userspace as input argument.
4155  */
4156 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4157 {
4158         int ret, karg = 0;
4159
4160         ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4161         if (ret)
4162                 return ret;
4163
4164         return put_user(karg, (int __user *)arg);
4165 }
4166
4167 /* A wrapper around sock ioctls, which copies the data from userspace
4168  * (depending on the protocol/ioctl), and copies back the result to userspace.
4169  * The main motivation for this function is to pass kernel memory to the
4170  * protocol ioctl callbacks, instead of userspace memory.
4171  */
4172 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4173 {
4174         int rc = 1;
4175
4176         if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4177                 rc = ipmr_sk_ioctl(sk, cmd, arg);
4178         else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4179                 rc = ip6mr_sk_ioctl(sk, cmd, arg);
4180         else if (sk_is_phonet(sk))
4181                 rc = phonet_sk_ioctl(sk, cmd, arg);
4182
4183         /* If ioctl was processed, returns its value */
4184         if (rc <= 0)
4185                 return rc;
4186
4187         /* Otherwise call the default handler */
4188         return sock_ioctl_out(sk, cmd, arg);
4189 }
4190 EXPORT_SYMBOL(sk_ioctl);