net/core/sock.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              Generic socket support routines. Memory allocators, socket lock/release
   8  *              handler for protocols to use and generic option handler.
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  */
  85
  86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88 #include <asm/unaligned.h>
  89 #include <linux/capability.h>
  90 #include <linux/errno.h>
  91 #include <linux/errqueue.h>
  92 #include <linux/types.h>
  93 #include <linux/socket.h>
  94 #include <linux/in.h>
  95 #include <linux/kernel.h>
  96 #include <linux/module.h>
  97 #include <linux/proc_fs.h>
  98 #include <linux/seq_file.h>
  99 #include <linux/sched.h>
 100 #include <linux/sched/mm.h>
 101 #include <linux/timer.h>
 102 #include <linux/string.h>
 103 #include <linux/sockios.h>
 104 #include <linux/net.h>
 105 #include <linux/mm.h>
 106 #include <linux/slab.h>
 107 #include <linux/interrupt.h>
 108 #include <linux/poll.h>
 109 #include <linux/tcp.h>
 110 #include <linux/init.h>
 111 #include <linux/highmem.h>
 112 #include <linux/user_namespace.h>
 113 #include <linux/static_key.h>
 114 #include <linux/memcontrol.h>
 115 #include <linux/prefetch.h>
 116 #include <linux/compat.h>
 117 #include <linux/mroute.h>
 118 #include <linux/mroute6.h>
 119 #include <linux/icmpv6.h>
 120
 121 #include <linux/uaccess.h>
 122
 123 #include <linux/netdevice.h>
 124 #include <net/protocol.h>
 125 #include <linux/skbuff.h>
 126 #include <net/net_namespace.h>
 127 #include <net/request_sock.h>
 128 #include <net/sock.h>
 129 #include <linux/net_tstamp.h>
 130 #include <net/xfrm.h>
 131 #include <linux/ipsec.h>
 132 #include <net/cls_cgroup.h>
 133 #include <net/netprio_cgroup.h>
 134 #include <linux/sock_diag.h>
 135
 136 #include <linux/filter.h>
 137 #include <net/sock_reuseport.h>
 138 #include <net/bpf_sk_storage.h>
 139
 140 #include <trace/events/sock.h>
 141
 142 #include <net/tcp.h>
 143 #include <net/busy_poll.h>
 144 #include <net/phonet/phonet.h>
 145
 146 #include <linux/ethtool.h>
 147
 148 #include "dev.h"
 149
 150 static DEFINE_MUTEX(proto_list_mutex);
 151 static LIST_HEAD(proto_list);
 152
 153 static void sock_def_write_space_wfree(struct sock *sk);
 154 static void sock_def_write_space(struct sock *sk);
 155
 156 /**
 157  * sk_ns_capable - General socket capability test
 158  * @sk: Socket to use a capability on or through
 159  * @user_ns: The user namespace of the capability to use
 160  * @cap: The capability to use
 161  *
 162  * Test to see if the opener of the socket had when the socket was
 163  * created and the current process has the capability @cap in the user
 164  * namespace @user_ns.
 165  */
 166 bool sk_ns_capable(const struct sock *sk,
 167                    struct user_namespace *user_ns, int cap)
 168 {
 169         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 170                 ns_capable(user_ns, cap);
 171 }
 172 EXPORT_SYMBOL(sk_ns_capable);
 173
 174 /**
 175  * sk_capable - Socket global capability test
 176  * @sk: Socket to use a capability on or through
 177  * @cap: The global capability to use
 178  *
 179  * Test to see if the opener of the socket had when the socket was
 180  * created and the current process has the capability @cap in all user
 181  * namespaces.
 182  */
 183 bool sk_capable(const struct sock *sk, int cap)
 184 {
 185         return sk_ns_capable(sk, &init_user_ns, cap);
 186 }
 187 EXPORT_SYMBOL(sk_capable);
 188
 189 /**
 190  * sk_net_capable - Network namespace socket capability test
 191  * @sk: Socket to use a capability on or through
 192  * @cap: The capability to use
 193  *
 194  * Test to see if the opener of the socket had when the socket was created
 195  * and the current process has the capability @cap over the network namespace
 196  * the socket is a member of.
 197  */
 198 bool sk_net_capable(const struct sock *sk, int cap)
 199 {
 200         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 201 }
 202 EXPORT_SYMBOL(sk_net_capable);
 203
 204 /*
 205  * Each address family might have different locking rules, so we have
 206  * one slock key per address family and separate keys for internal and
 207  * userspace sockets.
 208  */
 209 static struct lock_class_key af_family_keys[AF_MAX];
 210 static struct lock_class_key af_family_kern_keys[AF_MAX];
 211 static struct lock_class_key af_family_slock_keys[AF_MAX];
 212 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 213
 214 /*
 215  * Make lock validator output more readable. (we pre-construct these
 216  * strings build-time, so that runtime initialization of socket
 217  * locks is fast):
 218  */
 219
 220 #define _sock_locks(x)                                            \
 221   x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 222   x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 223   x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 224   x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 225   x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 226   x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 227   x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 228   x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 229   x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 230   x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 231   x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 232   x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 233   x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 234   x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 235   x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
 236   x "AF_MCTP"  , \
 237   x "AF_MAX"
 238
 239 static const char *const af_family_key_strings[AF_MAX+1] = {
 240         _sock_locks("sk_lock-")
 241 };
 242 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 243         _sock_locks("slock-")
 244 };
 245 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 246         _sock_locks("clock-")
 247 };
 248
 249 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 250         _sock_locks("k-sk_lock-")
 251 };
 252 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 253         _sock_locks("k-slock-")
 254 };
 255 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 256         _sock_locks("k-clock-")
 257 };
 258 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 259         _sock_locks("rlock-")
 260 };
 261 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 262         _sock_locks("wlock-")
 263 };
 264 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 265         _sock_locks("elock-")
 266 };
 267
 268 /*
 269  * sk_callback_lock and sk queues locking rules are per-address-family,
 270  * so split the lock classes by using a per-AF key:
 271  */
 272 static struct lock_class_key af_callback_keys[AF_MAX];
 273 static struct lock_class_key af_rlock_keys[AF_MAX];
 274 static struct lock_class_key af_wlock_keys[AF_MAX];
 275 static struct lock_class_key af_elock_keys[AF_MAX];
 276 static struct lock_class_key af_kern_callback_keys[AF_MAX];
 277
 278 /* Run time adjustable parameters. */
 279 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 280 EXPORT_SYMBOL(sysctl_wmem_max);
 281 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 282 EXPORT_SYMBOL(sysctl_rmem_max);
 283 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 284 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 285
 286 /* Maximal space eaten by iovec or ancillary data plus some space */
 287 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 288 EXPORT_SYMBOL(sysctl_optmem_max);
 289
 290 int sysctl_tstamp_allow_data __read_mostly = 1;
 291
 292 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 293 EXPORT_SYMBOL_GPL(memalloc_socks_key);
 294
 295 /**
 296  * sk_set_memalloc - sets %SOCK_MEMALLOC
 297  * @sk: socket to set it on
 298  *
 299  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 300  * It's the responsibility of the admin to adjust min_free_kbytes
 301  * to meet the requirements
 302  */
 303 void sk_set_memalloc(struct sock *sk)
 304 {
 305         sock_set_flag(sk, SOCK_MEMALLOC);
 306         sk->sk_allocation |= __GFP_MEMALLOC;
 307         static_branch_inc(&memalloc_socks_key);
 308 }
 309 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 310
 311 void sk_clear_memalloc(struct sock *sk)
 312 {
 313         sock_reset_flag(sk, SOCK_MEMALLOC);
 314         sk->sk_allocation &= ~__GFP_MEMALLOC;
 315         static_branch_dec(&memalloc_socks_key);
 316
 317         /*
 318          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 319          * progress of swapping. SOCK_MEMALLOC may be cleared while
 320          * it has rmem allocations due to the last swapfile being deactivated
 321          * but there is a risk that the socket is unusable due to exceeding
 322          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 323          */
 324         sk_mem_reclaim(sk);
 325 }
 326 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 327
 328 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 329 {
 330         int ret;
 331         unsigned int noreclaim_flag;
 332
 333         /* these should have been dropped before queueing */
 334         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 335
 336         noreclaim_flag = memalloc_noreclaim_save();
 337         ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
 338                                  tcp_v6_do_rcv,
 339                                  tcp_v4_do_rcv,
 340                                  sk, skb);
 341         memalloc_noreclaim_restore(noreclaim_flag);
 342
 343         return ret;
 344 }
 345 EXPORT_SYMBOL(__sk_backlog_rcv);
 346
 347 void sk_error_report(struct sock *sk)
 348 {
 349         sk->sk_error_report(sk);
 350
 351         switch (sk->sk_family) {
 352         case AF_INET:
 353                 fallthrough;
 354         case AF_INET6:
 355                 trace_inet_sk_error_report(sk);
 356                 break;
 357         default:
 358                 break;
 359         }
 360 }
 361 EXPORT_SYMBOL(sk_error_report);
 362
 363 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 364 {
 365         struct __kernel_sock_timeval tv;
 366
 367         if (timeo == MAX_SCHEDULE_TIMEOUT) {
 368                 tv.tv_sec = 0;
 369                 tv.tv_usec = 0;
 370         } else {
 371                 tv.tv_sec = timeo / HZ;
 372                 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 373         }
 374
 375         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 376                 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 377                 *(struct old_timeval32 *)optval = tv32;
 378                 return sizeof(tv32);
 379         }
 380
 381         if (old_timeval) {
 382                 struct __kernel_old_timeval old_tv;
 383                 old_tv.tv_sec = tv.tv_sec;
 384                 old_tv.tv_usec = tv.tv_usec;
 385                 *(struct __kernel_old_timeval *)optval = old_tv;
 386                 return sizeof(old_tv);
 387         }
 388
 389         *(struct __kernel_sock_timeval *)optval = tv;
 390         return sizeof(tv);
 391 }
 392 EXPORT_SYMBOL(sock_get_timeout);
 393
 394 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
 395                            sockptr_t optval, int optlen, bool old_timeval)
 396 {
 397         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 398                 struct old_timeval32 tv32;
 399
 400                 if (optlen < sizeof(tv32))
 401                         return -EINVAL;
 402
 403                 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
 404                         return -EFAULT;
 405                 tv->tv_sec = tv32.tv_sec;
 406                 tv->tv_usec = tv32.tv_usec;
 407         } else if (old_timeval) {
 408                 struct __kernel_old_timeval old_tv;
 409
 410                 if (optlen < sizeof(old_tv))
 411                         return -EINVAL;
 412                 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
 413                         return -EFAULT;
 414                 tv->tv_sec = old_tv.tv_sec;
 415                 tv->tv_usec = old_tv.tv_usec;
 416         } else {
 417                 if (optlen < sizeof(*tv))
 418                         return -EINVAL;
 419                 if (copy_from_sockptr(tv, optval, sizeof(*tv)))
 420                         return -EFAULT;
 421         }
 422
 423         return 0;
 424 }
 425 EXPORT_SYMBOL(sock_copy_user_timeval);
 426
 427 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
 428                             bool old_timeval)
 429 {
 430         struct __kernel_sock_timeval tv;
 431         int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
 432
 433         if (err)
 434                 return err;
 435
 436         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 437                 return -EDOM;
 438
 439         if (tv.tv_sec < 0) {
 440                 static int warned __read_mostly;
 441
 442                 *timeo_p = 0;
 443                 if (warned < 10 && net_ratelimit()) {
 444                         warned++;
 445                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 446                                 __func__, current->comm, task_pid_nr(current));
 447                 }
 448                 return 0;
 449         }
 450         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 451         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 452                 return 0;
 453         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
 454                 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
 455         return 0;
 456 }
 457
 458 static bool sock_needs_netstamp(const struct sock *sk)
 459 {
 460         switch (sk->sk_family) {
 461         case AF_UNSPEC:
 462         case AF_UNIX:
 463                 return false;
 464         default:
 465                 return true;
 466         }
 467 }
 468
 469 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 470 {
 471         if (sk->sk_flags & flags) {
 472                 sk->sk_flags &= ~flags;
 473                 if (sock_needs_netstamp(sk) &&
 474                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 475                         net_disable_timestamp();
 476         }
 477 }
 478
 479
 480 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 481 {
 482         unsigned long flags;
 483         struct sk_buff_head *list = &sk->sk_receive_queue;
 484
 485         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 486                 atomic_inc(&sk->sk_drops);
 487                 trace_sock_rcvqueue_full(sk, skb);
 488                 return -ENOMEM;
 489         }
 490
 491         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 492                 atomic_inc(&sk->sk_drops);
 493                 return -ENOBUFS;
 494         }
 495
 496         skb->dev = NULL;
 497         skb_set_owner_r(skb, sk);
 498
 499         /* we escape from rcu protected region, make sure we dont leak
 500          * a norefcounted dst
 501          */
 502         skb_dst_force(skb);
 503
 504         spin_lock_irqsave(&list->lock, flags);
 505         sock_skb_set_dropcount(sk, skb);
 506         __skb_queue_tail(list, skb);
 507         spin_unlock_irqrestore(&list->lock, flags);
 508
 509         if (!sock_flag(sk, SOCK_DEAD))
 510                 sk->sk_data_ready(sk);
 511         return 0;
 512 }
 513 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 514
 515 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
 516                               enum skb_drop_reason *reason)
 517 {
 518         enum skb_drop_reason drop_reason;
 519         int err;
 520
 521         err = sk_filter(sk, skb);
 522         if (err) {
 523                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
 524                 goto out;
 525         }
 526         err = __sock_queue_rcv_skb(sk, skb);
 527         switch (err) {
 528         case -ENOMEM:
 529                 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
 530                 break;
 531         case -ENOBUFS:
 532                 drop_reason = SKB_DROP_REASON_PROTO_MEM;
 533                 break;
 534         default:
 535                 drop_reason = SKB_NOT_DROPPED_YET;
 536                 break;
 537         }
 538 out:
 539         if (reason)
 540                 *reason = drop_reason;
 541         return err;
 542 }
 543 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
 544
 545 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 546                      const int nested, unsigned int trim_cap, bool refcounted)
 547 {
 548         int rc = NET_RX_SUCCESS;
 549
 550         if (sk_filter_trim_cap(sk, skb, trim_cap))
 551                 goto discard_and_relse;
 552
 553         skb->dev = NULL;
 554
 555         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 556                 atomic_inc(&sk->sk_drops);
 557                 goto discard_and_relse;
 558         }
 559         if (nested)
 560                 bh_lock_sock_nested(sk);
 561         else
 562                 bh_lock_sock(sk);
 563         if (!sock_owned_by_user(sk)) {
 564                 /*
 565                  * trylock + unlock semantics:
 566                  */
 567                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 568
 569                 rc = sk_backlog_rcv(sk, skb);
 570
 571                 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 572         } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
 573                 bh_unlock_sock(sk);
 574                 atomic_inc(&sk->sk_drops);
 575                 goto discard_and_relse;
 576         }
 577
 578         bh_unlock_sock(sk);
 579 out:
 580         if (refcounted)
 581                 sock_put(sk);
 582         return rc;
 583 discard_and_relse:
 584         kfree_skb(skb);
 585         goto out;
 586 }
 587 EXPORT_SYMBOL(__sk_receive_skb);
 588
 589 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
 590                                                           u32));
 591 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 592                                                            u32));
 593 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 594 {
 595         struct dst_entry *dst = __sk_dst_get(sk);
 596
 597         if (dst && dst->obsolete &&
 598             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 599                                dst, cookie) == NULL) {
 600                 sk_tx_queue_clear(sk);
 601                 sk->sk_dst_pending_confirm = 0;
 602                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 603                 dst_release(dst);
 604                 return NULL;
 605         }
 606
 607         return dst;
 608 }
 609 EXPORT_SYMBOL(__sk_dst_check);
 610
 611 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 612 {
 613         struct dst_entry *dst = sk_dst_get(sk);
 614
 615         if (dst && dst->obsolete &&
 616             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 617                                dst, cookie) == NULL) {
 618                 sk_dst_reset(sk);
 619                 dst_release(dst);
 620                 return NULL;
 621         }
 622
 623         return dst;
 624 }
 625 EXPORT_SYMBOL(sk_dst_check);
 626
 627 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
 628 {
 629         int ret = -ENOPROTOOPT;
 630 #ifdef CONFIG_NETDEVICES
 631         struct net *net = sock_net(sk);
 632
 633         /* Sorry... */
 634         ret = -EPERM;
 635         if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
 636                 goto out;
 637
 638         ret = -EINVAL;
 639         if (ifindex < 0)
 640                 goto out;
 641
 642         /* Paired with all READ_ONCE() done locklessly. */
 643         WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
 644
 645         if (sk->sk_prot->rehash)
 646                 sk->sk_prot->rehash(sk);
 647         sk_dst_reset(sk);
 648
 649         ret = 0;
 650
 651 out:
 652 #endif
 653
 654         return ret;
 655 }
 656
 657 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
 658 {
 659         int ret;
 660
 661         if (lock_sk)
 662                 lock_sock(sk);
 663         ret = sock_bindtoindex_locked(sk, ifindex);
 664         if (lock_sk)
 665                 release_sock(sk);
 666
 667         return ret;
 668 }
 669 EXPORT_SYMBOL(sock_bindtoindex);
 670
 671 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
 672 {
 673         int ret = -ENOPROTOOPT;
 674 #ifdef CONFIG_NETDEVICES
 675         struct net *net = sock_net(sk);
 676         char devname[IFNAMSIZ];
 677         int index;
 678
 679         ret = -EINVAL;
 680         if (optlen < 0)
 681                 goto out;
 682
 683         /* Bind this socket to a particular device like "eth0",
 684          * as specified in the passed interface name. If the
 685          * name is "" or the option length is zero the socket
 686          * is not bound.
 687          */
 688         if (optlen > IFNAMSIZ - 1)
 689                 optlen = IFNAMSIZ - 1;
 690         memset(devname, 0, sizeof(devname));
 691
 692         ret = -EFAULT;
 693         if (copy_from_sockptr(devname, optval, optlen))
 694                 goto out;
 695
 696         index = 0;
 697         if (devname[0] != '\0') {
 698                 struct net_device *dev;
 699
 700                 rcu_read_lock();
 701                 dev = dev_get_by_name_rcu(net, devname);
 702                 if (dev)
 703                         index = dev->ifindex;
 704                 rcu_read_unlock();
 705                 ret = -ENODEV;
 706                 if (!dev)
 707                         goto out;
 708         }
 709
 710         sockopt_lock_sock(sk);
 711         ret = sock_bindtoindex_locked(sk, index);
 712         sockopt_release_sock(sk);
 713 out:
 714 #endif
 715
 716         return ret;
 717 }
 718
 719 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
 720                                 sockptr_t optlen, int len)
 721 {
 722         int ret = -ENOPROTOOPT;
 723 #ifdef CONFIG_NETDEVICES
 724         int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
 725         struct net *net = sock_net(sk);
 726         char devname[IFNAMSIZ];
 727
 728         if (bound_dev_if == 0) {
 729                 len = 0;
 730                 goto zero;
 731         }
 732
 733         ret = -EINVAL;
 734         if (len < IFNAMSIZ)
 735                 goto out;
 736
 737         ret = netdev_get_name(net, devname, bound_dev_if);
 738         if (ret)
 739                 goto out;
 740
 741         len = strlen(devname) + 1;
 742
 743         ret = -EFAULT;
 744         if (copy_to_sockptr(optval, devname, len))
 745                 goto out;
 746
 747 zero:
 748         ret = -EFAULT;
 749         if (copy_to_sockptr(optlen, &len, sizeof(int)))
 750                 goto out;
 751
 752         ret = 0;
 753
 754 out:
 755 #endif
 756
 757         return ret;
 758 }
 759
 760 bool sk_mc_loop(struct sock *sk)
 761 {
 762         if (dev_recursion_level())
 763                 return false;
 764         if (!sk)
 765                 return true;
 766         switch (sk->sk_family) {
 767         case AF_INET:
 768                 return inet_sk(sk)->mc_loop;
 769 #if IS_ENABLED(CONFIG_IPV6)
 770         case AF_INET6:
 771                 return inet6_sk(sk)->mc_loop;
 772 #endif
 773         }
 774         WARN_ON_ONCE(1);
 775         return true;
 776 }
 777 EXPORT_SYMBOL(sk_mc_loop);
 778
 779 void sock_set_reuseaddr(struct sock *sk)
 780 {
 781         lock_sock(sk);
 782         sk->sk_reuse = SK_CAN_REUSE;
 783         release_sock(sk);
 784 }
 785 EXPORT_SYMBOL(sock_set_reuseaddr);
 786
 787 void sock_set_reuseport(struct sock *sk)
 788 {
 789         lock_sock(sk);
 790         sk->sk_reuseport = true;
 791         release_sock(sk);
 792 }
 793 EXPORT_SYMBOL(sock_set_reuseport);
 794
 795 void sock_no_linger(struct sock *sk)
 796 {
 797         lock_sock(sk);
 798         sk->sk_lingertime = 0;
 799         sock_set_flag(sk, SOCK_LINGER);
 800         release_sock(sk);
 801 }
 802 EXPORT_SYMBOL(sock_no_linger);
 803
 804 void sock_set_priority(struct sock *sk, u32 priority)
 805 {
 806         lock_sock(sk);
 807         sk->sk_priority = priority;
 808         release_sock(sk);
 809 }
 810 EXPORT_SYMBOL(sock_set_priority);
 811
 812 void sock_set_sndtimeo(struct sock *sk, s64 secs)
 813 {
 814         lock_sock(sk);
 815         if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
 816                 sk->sk_sndtimeo = secs * HZ;
 817         else
 818                 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
 819         release_sock(sk);
 820 }
 821 EXPORT_SYMBOL(sock_set_sndtimeo);
 822
 823 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
 824 {
 825         if (val)  {
 826                 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
 827                 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
 828                 sock_set_flag(sk, SOCK_RCVTSTAMP);
 829                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 830         } else {
 831                 sock_reset_flag(sk, SOCK_RCVTSTAMP);
 832                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 833         }
 834 }
 835
 836 void sock_enable_timestamps(struct sock *sk)
 837 {
 838         lock_sock(sk);
 839         __sock_set_timestamps(sk, true, false, true);
 840         release_sock(sk);
 841 }
 842 EXPORT_SYMBOL(sock_enable_timestamps);
 843
 844 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
 845 {
 846         switch (optname) {
 847         case SO_TIMESTAMP_OLD:
 848                 __sock_set_timestamps(sk, valbool, false, false);
 849                 break;
 850         case SO_TIMESTAMP_NEW:
 851                 __sock_set_timestamps(sk, valbool, true, false);
 852                 break;
 853         case SO_TIMESTAMPNS_OLD:
 854                 __sock_set_timestamps(sk, valbool, false, true);
 855                 break;
 856         case SO_TIMESTAMPNS_NEW:
 857                 __sock_set_timestamps(sk, valbool, true, true);
 858                 break;
 859         }
 860 }
 861
 862 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
 863 {
 864         struct net *net = sock_net(sk);
 865         struct net_device *dev = NULL;
 866         bool match = false;
 867         int *vclock_index;
 868         int i, num;
 869
 870         if (sk->sk_bound_dev_if)
 871                 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
 872
 873         if (!dev) {
 874                 pr_err("%s: sock not bind to device\n", __func__);
 875                 return -EOPNOTSUPP;
 876         }
 877
 878         num = ethtool_get_phc_vclocks(dev, &vclock_index);
 879         dev_put(dev);
 880
 881         for (i = 0; i < num; i++) {
 882                 if (*(vclock_index + i) == phc_index) {
 883                         match = true;
 884                         break;
 885                 }
 886         }
 887
 888         if (num > 0)
 889                 kfree(vclock_index);
 890
 891         if (!match)
 892                 return -EINVAL;
 893
 894         sk->sk_bind_phc = phc_index;
 895
 896         return 0;
 897 }
 898
 899 int sock_set_timestamping(struct sock *sk, int optname,
 900                           struct so_timestamping timestamping)
 901 {
 902         int val = timestamping.flags;
 903         int ret;
 904
 905         if (val & ~SOF_TIMESTAMPING_MASK)
 906                 return -EINVAL;
 907
 908         if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
 909             !(val & SOF_TIMESTAMPING_OPT_ID))
 910                 return -EINVAL;
 911
 912         if (val & SOF_TIMESTAMPING_OPT_ID &&
 913             !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 914                 if (sk_is_tcp(sk)) {
 915                         if ((1 << sk->sk_state) &
 916                             (TCPF_CLOSE | TCPF_LISTEN))
 917                                 return -EINVAL;
 918                         if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
 919                                 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
 920                         else
 921                                 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
 922                 } else {
 923                         atomic_set(&sk->sk_tskey, 0);
 924                 }
 925         }
 926
 927         if (val & SOF_TIMESTAMPING_OPT_STATS &&
 928             !(val & SOF_TIMESTAMPING_OPT_TSONLY))
 929                 return -EINVAL;
 930
 931         if (val & SOF_TIMESTAMPING_BIND_PHC) {
 932                 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
 933                 if (ret)
 934                         return ret;
 935         }
 936
 937         sk->sk_tsflags = val;
 938         sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
 939
 940         if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 941                 sock_enable_timestamp(sk,
 942                                       SOCK_TIMESTAMPING_RX_SOFTWARE);
 943         else
 944                 sock_disable_timestamp(sk,
 945                                        (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 946         return 0;
 947 }
 948
 949 void sock_set_keepalive(struct sock *sk)
 950 {
 951         lock_sock(sk);
 952         if (sk->sk_prot->keepalive)
 953                 sk->sk_prot->keepalive(sk, true);
 954         sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
 955         release_sock(sk);
 956 }
 957 EXPORT_SYMBOL(sock_set_keepalive);
 958
 959 static void __sock_set_rcvbuf(struct sock *sk, int val)
 960 {
 961         /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
 962          * as a negative value.
 963          */
 964         val = min_t(int, val, INT_MAX / 2);
 965         sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 966
 967         /* We double it on the way in to account for "struct sk_buff" etc.
 968          * overhead.   Applications assume that the SO_RCVBUF setting they make
 969          * will allow that much actual data to be received on that socket.
 970          *
 971          * Applications are unaware that "struct sk_buff" and other overheads
 972          * allocate from the receive buffer during socket buffer allocation.
 973          *
 974          * And after considering the possible alternatives, returning the value
 975          * we actually used in getsockopt is the most desirable behavior.
 976          */
 977         WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
 978 }
 979
 980 void sock_set_rcvbuf(struct sock *sk, int val)
 981 {
 982         lock_sock(sk);
 983         __sock_set_rcvbuf(sk, val);
 984         release_sock(sk);
 985 }
 986 EXPORT_SYMBOL(sock_set_rcvbuf);
 987
 988 static void __sock_set_mark(struct sock *sk, u32 val)
 989 {
 990         if (val != sk->sk_mark) {
 991                 sk->sk_mark = val;
 992                 sk_dst_reset(sk);
 993         }
 994 }
 995
 996 void sock_set_mark(struct sock *sk, u32 val)
 997 {
 998         lock_sock(sk);
 999         __sock_set_mark(sk, val);
1000         release_sock(sk);
1001 }
1002 EXPORT_SYMBOL(sock_set_mark);
1003
1004 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1005 {
1006         /* Round down bytes to multiple of pages */
1007         bytes = round_down(bytes, PAGE_SIZE);
1008
1009         WARN_ON(bytes > sk->sk_reserved_mem);
1010         sk->sk_reserved_mem -= bytes;
1011         sk_mem_reclaim(sk);
1012 }
1013
1014 static int sock_reserve_memory(struct sock *sk, int bytes)
1015 {
1016         long allocated;
1017         bool charged;
1018         int pages;
1019
1020         if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1021                 return -EOPNOTSUPP;
1022
1023         if (!bytes)
1024                 return 0;
1025
1026         pages = sk_mem_pages(bytes);
1027
1028         /* pre-charge to memcg */
1029         charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1030                                           GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1031         if (!charged)
1032                 return -ENOMEM;
1033
1034         /* pre-charge to forward_alloc */
1035         sk_memory_allocated_add(sk, pages);
1036         allocated = sk_memory_allocated(sk);
1037         /* If the system goes into memory pressure with this
1038          * precharge, give up and return error.
1039          */
1040         if (allocated > sk_prot_mem_limits(sk, 1)) {
1041                 sk_memory_allocated_sub(sk, pages);
1042                 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1043                 return -ENOMEM;
1044         }
1045         sk->sk_forward_alloc += pages << PAGE_SHIFT;
1046
1047         sk->sk_reserved_mem += pages << PAGE_SHIFT;
1048
1049         return 0;
1050 }
1051
/* Take the socket lock for a sockopt operation, unless we are running
 * on behalf of a bpf prog.
 *
 * When current->bpf_ctx is set, the setsockopt is called from a bpf
 * prog, and bpf has ensured the sk lock has been acquired before
 * calling setsockopt(); locking again here must be avoided.
 */
void sockopt_lock_sock(struct sock *sk)
{
	if (!has_current_bpf_ctx())
		lock_sock(sk);
}
EXPORT_SYMBOL(sockopt_lock_sock);
1064
/* Counterpart of sockopt_lock_sock(): release the socket lock unless a
 * bpf context is current, in which case the lock is left untouched.
 */
void sockopt_release_sock(struct sock *sk)
{
	if (!has_current_bpf_ctx())
		release_sock(sk);
}
EXPORT_SYMBOL(sockopt_release_sock);
1073
1074 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1075 {
1076         return has_current_bpf_ctx() || ns_capable(ns, cap);
1077 }
1078 EXPORT_SYMBOL(sockopt_ns_capable);
1079
1080 bool sockopt_capable(int cap)
1081 {
1082         return has_current_bpf_ctx() || capable(cap);
1083 }
1084 EXPORT_SYMBOL(sockopt_capable);
1085
1086 /*
1087  *      This is meant for all protocols to use and covers goings on
1088  *      at the socket level. Everything here is generic.
1089  */
1090
1091 int sk_setsockopt(struct sock *sk, int level, int optname,
1092                   sockptr_t optval, unsigned int optlen)
1093 {
1094         struct so_timestamping timestamping;
1095         struct socket *sock = sk->sk_socket;
1096         struct sock_txtime sk_txtime;
1097         int val;
1098         int valbool;
1099         struct linger ling;
1100         int ret = 0;
1101
1102         /*
1103          *      Options without arguments
1104          */
1105
1106         if (optname == SO_BINDTODEVICE)
1107                 return sock_setbindtodevice(sk, optval, optlen);
1108
1109         if (optlen < sizeof(int))
1110                 return -EINVAL;
1111
1112         if (copy_from_sockptr(&val, optval, sizeof(val)))
1113                 return -EFAULT;
1114
1115         valbool = val ? 1 : 0;
1116
1117         sockopt_lock_sock(sk);
1118
1119         switch (optname) {
1120         case SO_DEBUG:
1121                 if (val && !sockopt_capable(CAP_NET_ADMIN))
1122                         ret = -EACCES;
1123                 else
1124                         sock_valbool_flag(sk, SOCK_DBG, valbool);
1125                 break;
1126         case SO_REUSEADDR:
1127                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1128                 break;
1129         case SO_REUSEPORT:
1130                 sk->sk_reuseport = valbool;
1131                 break;
1132         case SO_TYPE:
1133         case SO_PROTOCOL:
1134         case SO_DOMAIN:
1135         case SO_ERROR:
1136                 ret = -ENOPROTOOPT;
1137                 break;
1138         case SO_DONTROUTE:
1139                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1140                 sk_dst_reset(sk);
1141                 break;
1142         case SO_BROADCAST:
1143                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1144                 break;
1145         case SO_SNDBUF:
1146                 /* Don't error on this BSD doesn't and if you think
1147                  * about it this is right. Otherwise apps have to
1148                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
1149                  * are treated in BSD as hints
1150                  */
1151                 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1152 set_sndbuf:
1153                 /* Ensure val * 2 fits into an int, to prevent max_t()
1154                  * from treating it as a negative value.
1155                  */
1156                 val = min_t(int, val, INT_MAX / 2);
1157                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1158                 WRITE_ONCE(sk->sk_sndbuf,
1159                            max_t(int, val * 2, SOCK_MIN_SNDBUF));
1160                 /* Wake up sending tasks if we upped the value. */
1161                 sk->sk_write_space(sk);
1162                 break;
1163
1164         case SO_SNDBUFFORCE:
1165                 if (!sockopt_capable(CAP_NET_ADMIN)) {
1166                         ret = -EPERM;
1167                         break;
1168                 }
1169
1170                 /* No negative values (to prevent underflow, as val will be
1171                  * multiplied by 2).
1172                  */
1173                 if (val < 0)
1174                         val = 0;
1175                 goto set_sndbuf;
1176
1177         case SO_RCVBUF:
1178                 /* Don't error on this BSD doesn't and if you think
1179                  * about it this is right. Otherwise apps have to
1180                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
1181                  * are treated in BSD as hints
1182                  */
1183                 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1184                 break;
1185
1186         case SO_RCVBUFFORCE:
1187                 if (!sockopt_capable(CAP_NET_ADMIN)) {
1188                         ret = -EPERM;
1189                         break;
1190                 }
1191
1192                 /* No negative values (to prevent underflow, as val will be
1193                  * multiplied by 2).
1194                  */
1195                 __sock_set_rcvbuf(sk, max(val, 0));
1196                 break;
1197
1198         case SO_KEEPALIVE:
1199                 if (sk->sk_prot->keepalive)
1200                         sk->sk_prot->keepalive(sk, valbool);
1201                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1202                 break;
1203
1204         case SO_OOBINLINE:
1205                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1206                 break;
1207
1208         case SO_NO_CHECK:
1209                 sk->sk_no_check_tx = valbool;
1210                 break;
1211
1212         case SO_PRIORITY:
1213                 if ((val >= 0 && val <= 6) ||
1214                     sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1215                     sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1216                         sk->sk_priority = val;
1217                 else
1218                         ret = -EPERM;
1219                 break;
1220
1221         case SO_LINGER:
1222                 if (optlen < sizeof(ling)) {
1223                         ret = -EINVAL;  /* 1003.1g */
1224                         break;
1225                 }
1226                 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1227                         ret = -EFAULT;
1228                         break;
1229                 }
1230                 if (!ling.l_onoff)
1231                         sock_reset_flag(sk, SOCK_LINGER);
1232                 else {
1233 #if (BITS_PER_LONG == 32)
1234                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1235                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1236                         else
1237 #endif
1238                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1239                         sock_set_flag(sk, SOCK_LINGER);
1240                 }
1241                 break;
1242
1243         case SO_BSDCOMPAT:
1244                 break;
1245
1246         case SO_PASSCRED:
1247                 assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1248                 break;
1249
1250         case SO_PASSPIDFD:
1251                 assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1252                 break;
1253
1254         case SO_TIMESTAMP_OLD:
1255         case SO_TIMESTAMP_NEW:
1256         case SO_TIMESTAMPNS_OLD:
1257         case SO_TIMESTAMPNS_NEW:
1258                 sock_set_timestamp(sk, optname, valbool);
1259                 break;
1260
1261         case SO_TIMESTAMPING_NEW:
1262         case SO_TIMESTAMPING_OLD:
1263                 if (optlen == sizeof(timestamping)) {
1264                         if (copy_from_sockptr(&timestamping, optval,
1265                                               sizeof(timestamping))) {
1266                                 ret = -EFAULT;
1267                                 break;
1268                         }
1269                 } else {
1270                         memset(&timestamping, 0, sizeof(timestamping));
1271                         timestamping.flags = val;
1272                 }
1273                 ret = sock_set_timestamping(sk, optname, timestamping);
1274                 break;
1275
1276         case SO_RCVLOWAT:
1277                 if (val < 0)
1278                         val = INT_MAX;
1279                 if (sock && sock->ops->set_rcvlowat)
1280                         ret = sock->ops->set_rcvlowat(sk, val);
1281                 else
1282                         WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1283                 break;
1284
1285         case SO_RCVTIMEO_OLD:
1286         case SO_RCVTIMEO_NEW:
1287                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1288                                        optlen, optname == SO_RCVTIMEO_OLD);
1289                 break;
1290
1291         case SO_SNDTIMEO_OLD:
1292         case SO_SNDTIMEO_NEW:
1293                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1294                                        optlen, optname == SO_SNDTIMEO_OLD);
1295                 break;
1296
1297         case SO_ATTACH_FILTER: {
1298                 struct sock_fprog fprog;
1299
1300                 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1301                 if (!ret)
1302                         ret = sk_attach_filter(&fprog, sk);
1303                 break;
1304         }
1305         case SO_ATTACH_BPF:
1306                 ret = -EINVAL;
1307                 if (optlen == sizeof(u32)) {
1308                         u32 ufd;
1309
1310                         ret = -EFAULT;
1311                         if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1312                                 break;
1313
1314                         ret = sk_attach_bpf(ufd, sk);
1315                 }
1316                 break;
1317
1318         case SO_ATTACH_REUSEPORT_CBPF: {
1319                 struct sock_fprog fprog;
1320
1321                 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1322                 if (!ret)
1323                         ret = sk_reuseport_attach_filter(&fprog, sk);
1324                 break;
1325         }
1326         case SO_ATTACH_REUSEPORT_EBPF:
1327                 ret = -EINVAL;
1328                 if (optlen == sizeof(u32)) {
1329                         u32 ufd;
1330
1331                         ret = -EFAULT;
1332                         if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1333                                 break;
1334
1335                         ret = sk_reuseport_attach_bpf(ufd, sk);
1336                 }
1337                 break;
1338
1339         case SO_DETACH_REUSEPORT_BPF:
1340                 ret = reuseport_detach_prog(sk);
1341                 break;
1342
1343         case SO_DETACH_FILTER:
1344                 ret = sk_detach_filter(sk);
1345                 break;
1346
1347         case SO_LOCK_FILTER:
1348                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1349                         ret = -EPERM;
1350                 else
1351                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1352                 break;
1353
1354         case SO_PASSSEC:
1355                 assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1356                 break;
1357         case SO_MARK:
1358                 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1359                     !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1360                         ret = -EPERM;
1361                         break;
1362                 }
1363
1364                 __sock_set_mark(sk, val);
1365                 break;
1366         case SO_RCVMARK:
1367                 sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1368                 break;
1369
1370         case SO_RXQ_OVFL:
1371                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1372                 break;
1373
1374         case SO_WIFI_STATUS:
1375                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1376                 break;
1377
1378         case SO_PEEK_OFF:
1379                 if (sock->ops->set_peek_off)
1380                         ret = sock->ops->set_peek_off(sk, val);
1381                 else
1382                         ret = -EOPNOTSUPP;
1383                 break;
1384
1385         case SO_NOFCS:
1386                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1387                 break;
1388
1389         case SO_SELECT_ERR_QUEUE:
1390                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1391                 break;
1392
1393 #ifdef CONFIG_NET_RX_BUSY_POLL
1394         case SO_BUSY_POLL:
1395                 if (val < 0)
1396                         ret = -EINVAL;
1397                 else
1398                         WRITE_ONCE(sk->sk_ll_usec, val);
1399                 break;
1400         case SO_PREFER_BUSY_POLL:
1401                 if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1402                         ret = -EPERM;
1403                 else
1404                         WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1405                 break;
1406         case SO_BUSY_POLL_BUDGET:
1407                 if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) {
1408                         ret = -EPERM;
1409                 } else {
1410                         if (val < 0 || val > U16_MAX)
1411                                 ret = -EINVAL;
1412                         else
1413                                 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1414                 }
1415                 break;
1416 #endif
1417
1418         case SO_MAX_PACING_RATE:
1419                 {
1420                 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1421
1422                 if (sizeof(ulval) != sizeof(val) &&
1423                     optlen >= sizeof(ulval) &&
1424                     copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1425                         ret = -EFAULT;
1426                         break;
1427                 }
1428                 if (ulval != ~0UL)
1429                         cmpxchg(&sk->sk_pacing_status,
1430                                 SK_PACING_NONE,
1431                                 SK_PACING_NEEDED);
1432                 sk->sk_max_pacing_rate = ulval;
1433                 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1434                 break;
1435                 }
1436         case SO_INCOMING_CPU:
1437                 reuseport_update_incoming_cpu(sk, val);
1438                 break;
1439
1440         case SO_CNX_ADVICE:
1441                 if (val == 1)
1442                         dst_negative_advice(sk);
1443                 break;
1444
1445         case SO_ZEROCOPY:
1446                 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1447                         if (!(sk_is_tcp(sk) ||
1448                               (sk->sk_type == SOCK_DGRAM &&
1449                                sk->sk_protocol == IPPROTO_UDP)))
1450                                 ret = -EOPNOTSUPP;
1451                 } else if (sk->sk_family != PF_RDS) {
1452                         ret = -EOPNOTSUPP;
1453                 }
1454                 if (!ret) {
1455                         if (val < 0 || val > 1)
1456                                 ret = -EINVAL;
1457                         else
1458                                 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1459                 }
1460                 break;
1461
1462         case SO_TXTIME:
1463                 if (optlen != sizeof(struct sock_txtime)) {
1464                         ret = -EINVAL;
1465                         break;
1466                 } else if (copy_from_sockptr(&sk_txtime, optval,
1467                            sizeof(struct sock_txtime))) {
1468                         ret = -EFAULT;
1469                         break;
1470                 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1471                         ret = -EINVAL;
1472                         break;
1473                 }
1474                 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1475                  * scheduler has enough safe guards.
1476                  */
1477                 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1478                     !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1479                         ret = -EPERM;
1480                         break;
1481                 }
1482                 sock_valbool_flag(sk, SOCK_TXTIME, true);
1483                 sk->sk_clockid = sk_txtime.clockid;
1484                 sk->sk_txtime_deadline_mode =
1485                         !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1486                 sk->sk_txtime_report_errors =
1487                         !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1488                 break;
1489
1490         case SO_BINDTOIFINDEX:
1491                 ret = sock_bindtoindex_locked(sk, val);
1492                 break;
1493
1494         case SO_BUF_LOCK:
1495                 if (val & ~SOCK_BUF_LOCK_MASK) {
1496                         ret = -EINVAL;
1497                         break;
1498                 }
1499                 sk->sk_userlocks = val | (sk->sk_userlocks &
1500                                           ~SOCK_BUF_LOCK_MASK);
1501                 break;
1502
1503         case SO_RESERVE_MEM:
1504         {
1505                 int delta;
1506
1507                 if (val < 0) {
1508                         ret = -EINVAL;
1509                         break;
1510                 }
1511
1512                 delta = val - sk->sk_reserved_mem;
1513                 if (delta < 0)
1514                         sock_release_reserved_memory(sk, -delta);
1515                 else
1516                         ret = sock_reserve_memory(sk, delta);
1517                 break;
1518         }
1519
1520         case SO_TXREHASH:
1521                 if (val < -1 || val > 1) {
1522                         ret = -EINVAL;
1523                         break;
1524                 }
1525                 if ((u8)val == SOCK_TXREHASH_DEFAULT)
1526                         val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1527                 /* Paired with READ_ONCE() in tcp_rtx_synack() */
1528                 WRITE_ONCE(sk->sk_txrehash, (u8)val);
1529                 break;
1530
1531         default:
1532                 ret = -ENOPROTOOPT;
1533                 break;
1534         }
1535         sockopt_release_sock(sk);
1536         return ret;
1537 }
1538
1539 int sock_setsockopt(struct socket *sock, int level, int optname,
1540                     sockptr_t optval, unsigned int optlen)
1541 {
1542         return sk_setsockopt(sock->sk, level, optname,
1543                              optval, optlen);
1544 }
1545 EXPORT_SYMBOL(sock_setsockopt);
1546
1547 static const struct cred *sk_get_peer_cred(struct sock *sk)
1548 {
1549         const struct cred *cred;
1550
1551         spin_lock(&sk->sk_peer_lock);
1552         cred = get_cred(sk->sk_peer_cred);
1553         spin_unlock(&sk->sk_peer_lock);
1554
1555         return cred;
1556 }
1557
1558 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1559                           struct ucred *ucred)
1560 {
1561         ucred->pid = pid_vnr(pid);
1562         ucred->uid = ucred->gid = -1;
1563         if (cred) {
1564                 struct user_namespace *current_ns = current_user_ns();
1565
1566                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1567                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1568         }
1569 }
1570
1571 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1572 {
1573         struct user_namespace *user_ns = current_user_ns();
1574         int i;
1575
1576         for (i = 0; i < src->ngroups; i++) {
1577                 gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1578
1579                 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1580                         return -EFAULT;
1581         }
1582
1583         return 0;
1584 }
1585
1586 int sk_getsockopt(struct sock *sk, int level, int optname,
1587                   sockptr_t optval, sockptr_t optlen)
1588 {
1589         struct socket *sock = sk->sk_socket;
1590
1591         union {
1592                 int val;
1593                 u64 val64;
1594                 unsigned long ulval;
1595                 struct linger ling;
1596                 struct old_timeval32 tm32;
1597                 struct __kernel_old_timeval tm;
1598                 struct  __kernel_sock_timeval stm;
1599                 struct sock_txtime txtime;
1600                 struct so_timestamping timestamping;
1601         } v;
1602
1603         int lv = sizeof(int);
1604         int len;
1605
1606         if (copy_from_sockptr(&len, optlen, sizeof(int)))
1607                 return -EFAULT;
1608         if (len < 0)
1609                 return -EINVAL;
1610
1611         memset(&v, 0, sizeof(v));
1612
1613         switch (optname) {
1614         case SO_DEBUG:
1615                 v.val = sock_flag(sk, SOCK_DBG);
1616                 break;
1617
1618         case SO_DONTROUTE:
1619                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1620                 break;
1621
1622         case SO_BROADCAST:
1623                 v.val = sock_flag(sk, SOCK_BROADCAST);
1624                 break;
1625
1626         case SO_SNDBUF:
1627                 v.val = sk->sk_sndbuf;
1628                 break;
1629
1630         case SO_RCVBUF:
1631                 v.val = sk->sk_rcvbuf;
1632                 break;
1633
1634         case SO_REUSEADDR:
1635                 v.val = sk->sk_reuse;
1636                 break;
1637
1638         case SO_REUSEPORT:
1639                 v.val = sk->sk_reuseport;
1640                 break;
1641
1642         case SO_KEEPALIVE:
1643                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1644                 break;
1645
1646         case SO_TYPE:
1647                 v.val = sk->sk_type;
1648                 break;
1649
1650         case SO_PROTOCOL:
1651                 v.val = sk->sk_protocol;
1652                 break;
1653
1654         case SO_DOMAIN:
1655                 v.val = sk->sk_family;
1656                 break;
1657
1658         case SO_ERROR:
1659                 v.val = -sock_error(sk);
1660                 if (v.val == 0)
1661                         v.val = xchg(&sk->sk_err_soft, 0);
1662                 break;
1663
1664         case SO_OOBINLINE:
1665                 v.val = sock_flag(sk, SOCK_URGINLINE);
1666                 break;
1667
1668         case SO_NO_CHECK:
1669                 v.val = sk->sk_no_check_tx;
1670                 break;
1671
1672         case SO_PRIORITY:
1673                 v.val = sk->sk_priority;
1674                 break;
1675
1676         case SO_LINGER:
1677                 lv              = sizeof(v.ling);
1678                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1679                 v.ling.l_linger = sk->sk_lingertime / HZ;
1680                 break;
1681
1682         case SO_BSDCOMPAT:
1683                 break;
1684
1685         case SO_TIMESTAMP_OLD:
1686                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1687                                 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1688                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1689                 break;
1690
1691         case SO_TIMESTAMPNS_OLD:
1692                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1693                 break;
1694
1695         case SO_TIMESTAMP_NEW:
1696                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1697                 break;
1698
1699         case SO_TIMESTAMPNS_NEW:
1700                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1701                 break;
1702
1703         case SO_TIMESTAMPING_OLD:
1704                 lv = sizeof(v.timestamping);
1705                 v.timestamping.flags = sk->sk_tsflags;
1706                 v.timestamping.bind_phc = sk->sk_bind_phc;
1707                 break;
1708
1709         case SO_RCVTIMEO_OLD:
1710         case SO_RCVTIMEO_NEW:
1711                 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1712                 break;
1713
1714         case SO_SNDTIMEO_OLD:
1715         case SO_SNDTIMEO_NEW:
1716                 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1717                 break;
1718
1719         case SO_RCVLOWAT:
1720                 v.val = sk->sk_rcvlowat;
1721                 break;
1722
1723         case SO_SNDLOWAT:
1724                 v.val = 1;
1725                 break;
1726
1727         case SO_PASSCRED:
1728                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1729                 break;
1730
1731         case SO_PASSPIDFD:
1732                 v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1733                 break;
1734
1735         case SO_PEERCRED:
1736         {
1737                 struct ucred peercred;
1738                 if (len > sizeof(peercred))
1739                         len = sizeof(peercred);
1740
1741                 spin_lock(&sk->sk_peer_lock);
1742                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1743                 spin_unlock(&sk->sk_peer_lock);
1744
1745                 if (copy_to_sockptr(optval, &peercred, len))
1746                         return -EFAULT;
1747                 goto lenout;
1748         }
1749
1750         case SO_PEERPIDFD:
1751         {
1752                 struct pid *peer_pid;
1753                 struct file *pidfd_file = NULL;
1754                 int pidfd;
1755
1756                 if (len > sizeof(pidfd))
1757                         len = sizeof(pidfd);
1758
1759                 spin_lock(&sk->sk_peer_lock);
1760                 peer_pid = get_pid(sk->sk_peer_pid);
1761                 spin_unlock(&sk->sk_peer_lock);
1762
1763                 if (!peer_pid)
1764                         return -ESRCH;
1765
1766                 pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1767                 put_pid(peer_pid);
1768                 if (pidfd < 0)
1769                         return pidfd;
1770
1771                 if (copy_to_sockptr(optval, &pidfd, len) ||
1772                     copy_to_sockptr(optlen, &len, sizeof(int))) {
1773                         put_unused_fd(pidfd);
1774                         fput(pidfd_file);
1775
1776                         return -EFAULT;
1777                 }
1778
1779                 fd_install(pidfd, pidfd_file);
1780                 return 0;
1781         }
1782
1783         case SO_PEERGROUPS:
1784         {
1785                 const struct cred *cred;
1786                 int ret, n;
1787
1788                 cred = sk_get_peer_cred(sk);
1789                 if (!cred)
1790                         return -ENODATA;
1791
1792                 n = cred->group_info->ngroups;
1793                 if (len < n * sizeof(gid_t)) {
1794                         len = n * sizeof(gid_t);
1795                         put_cred(cred);
1796                         return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1797                 }
1798                 len = n * sizeof(gid_t);
1799
1800                 ret = groups_to_user(optval, cred->group_info);
1801                 put_cred(cred);
1802                 if (ret)
1803                         return ret;
1804                 goto lenout;
1805         }
1806
1807         case SO_PEERNAME:
1808         {
1809                 char address[128];
1810
1811                 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1812                 if (lv < 0)
1813                         return -ENOTCONN;
1814                 if (lv < len)
1815                         return -EINVAL;
1816                 if (copy_to_sockptr(optval, address, len))
1817                         return -EFAULT;
1818                 goto lenout;
1819         }
1820
1821         /* Dubious BSD thing... Probably nobody even uses it, but
1822          * the UNIX standard wants it for whatever reason... -DaveM
1823          */
1824         case SO_ACCEPTCONN:
1825                 v.val = sk->sk_state == TCP_LISTEN;
1826                 break;
1827
1828         case SO_PASSSEC:
1829                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1830                 break;
1831
1832         case SO_PEERSEC:
1833                 return security_socket_getpeersec_stream(sock,
1834                                                          optval, optlen, len);
1835
1836         case SO_MARK:
1837                 v.val = sk->sk_mark;
1838                 break;
1839
1840         case SO_RCVMARK:
1841                 v.val = sock_flag(sk, SOCK_RCVMARK);
1842                 break;
1843
1844         case SO_RXQ_OVFL:
1845                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1846                 break;
1847
1848         case SO_WIFI_STATUS:
1849                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1850                 break;
1851
1852         case SO_PEEK_OFF:
1853                 if (!sock->ops->set_peek_off)
1854                         return -EOPNOTSUPP;
1855
1856                 v.val = sk->sk_peek_off;
1857                 break;
1858         case SO_NOFCS:
1859                 v.val = sock_flag(sk, SOCK_NOFCS);
1860                 break;
1861
1862         case SO_BINDTODEVICE:
1863                 return sock_getbindtodevice(sk, optval, optlen, len);
1864
1865         case SO_GET_FILTER:
1866                 len = sk_get_filter(sk, optval, len);
1867                 if (len < 0)
1868                         return len;
1869
1870                 goto lenout;
1871
1872         case SO_LOCK_FILTER:
1873                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1874                 break;
1875
1876         case SO_BPF_EXTENSIONS:
1877                 v.val = bpf_tell_extensions();
1878                 break;
1879
1880         case SO_SELECT_ERR_QUEUE:
1881                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1882                 break;
1883
1884 #ifdef CONFIG_NET_RX_BUSY_POLL
1885         case SO_BUSY_POLL:
1886                 v.val = sk->sk_ll_usec;
1887                 break;
1888         case SO_PREFER_BUSY_POLL:
1889                 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1890                 break;
1891 #endif
1892
1893         case SO_MAX_PACING_RATE:
1894                 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1895                         lv = sizeof(v.ulval);
1896                         v.ulval = sk->sk_max_pacing_rate;
1897                 } else {
1898                         /* 32bit version */
1899                         v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1900                 }
1901                 break;
1902
1903         case SO_INCOMING_CPU:
1904                 v.val = READ_ONCE(sk->sk_incoming_cpu);
1905                 break;
1906
1907         case SO_MEMINFO:
1908         {
1909                 u32 meminfo[SK_MEMINFO_VARS];
1910
1911                 sk_get_meminfo(sk, meminfo);
1912
1913                 len = min_t(unsigned int, len, sizeof(meminfo));
1914                 if (copy_to_sockptr(optval, &meminfo, len))
1915                         return -EFAULT;
1916
1917                 goto lenout;
1918         }
1919
1920 #ifdef CONFIG_NET_RX_BUSY_POLL
1921         case SO_INCOMING_NAPI_ID:
1922                 v.val = READ_ONCE(sk->sk_napi_id);
1923
1924                 /* aggregate non-NAPI IDs down to 0 */
1925                 if (v.val < MIN_NAPI_ID)
1926                         v.val = 0;
1927
1928                 break;
1929 #endif
1930
1931         case SO_COOKIE:
1932                 lv = sizeof(u64);
1933                 if (len < lv)
1934                         return -EINVAL;
1935                 v.val64 = sock_gen_cookie(sk);
1936                 break;
1937
1938         case SO_ZEROCOPY:
1939                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1940                 break;
1941
1942         case SO_TXTIME:
1943                 lv = sizeof(v.txtime);
1944                 v.txtime.clockid = sk->sk_clockid;
1945                 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1946                                   SOF_TXTIME_DEADLINE_MODE : 0;
1947                 v.txtime.flags |= sk->sk_txtime_report_errors ?
1948                                   SOF_TXTIME_REPORT_ERRORS : 0;
1949                 break;
1950
1951         case SO_BINDTOIFINDEX:
1952                 v.val = READ_ONCE(sk->sk_bound_dev_if);
1953                 break;
1954
1955         case SO_NETNS_COOKIE:
1956                 lv = sizeof(u64);
1957                 if (len != lv)
1958                         return -EINVAL;
1959                 v.val64 = sock_net(sk)->net_cookie;
1960                 break;
1961
1962         case SO_BUF_LOCK:
1963                 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1964                 break;
1965
1966         case SO_RESERVE_MEM:
1967                 v.val = sk->sk_reserved_mem;
1968                 break;
1969
1970         case SO_TXREHASH:
1971                 v.val = sk->sk_txrehash;
1972                 break;
1973
1974         default:
1975                 /* We implement the SO_SNDLOWAT etc to not be settable
1976                  * (1003.1g 7).
1977                  */
1978                 return -ENOPROTOOPT;
1979         }
1980
1981         if (len > lv)
1982                 len = lv;
1983         if (copy_to_sockptr(optval, &v, len))
1984                 return -EFAULT;
1985 lenout:
1986         if (copy_to_sockptr(optlen, &len, sizeof(int)))
1987                 return -EFAULT;
1988         return 0;
1989 }
1990
1991 int sock_getsockopt(struct socket *sock, int level, int optname,
1992                     char __user *optval, int __user *optlen)
1993 {
1994         return sk_getsockopt(sock->sk, level, optname,
1995                              USER_SOCKPTR(optval),
1996                              USER_SOCKPTR(optlen));
1997 }
1998
1999 /*
2000  * Initialize an sk_lock.
2001  *
2002  * (We also register the sk_lock with the lock validator.)
2003  */
2004 static inline void sock_lock_init(struct sock *sk)
2005 {
2006         if (sk->sk_kern_sock)
2007                 sock_lock_init_class_and_name(
2008                         sk,
2009                         af_family_kern_slock_key_strings[sk->sk_family],
2010                         af_family_kern_slock_keys + sk->sk_family,
2011                         af_family_kern_key_strings[sk->sk_family],
2012                         af_family_kern_keys + sk->sk_family);
2013         else
2014                 sock_lock_init_class_and_name(
2015                         sk,
2016                         af_family_slock_key_strings[sk->sk_family],
2017                         af_family_slock_keys + sk->sk_family,
2018                         af_family_key_strings[sk->sk_family],
2019                         af_family_keys + sk->sk_family);
2020 }
2021
2022 /*
2023  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2024  * even temporarly, because of RCU lookups. sk_node should also be left as is.
2025  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2026  */
2027 static void sock_copy(struct sock *nsk, const struct sock *osk)
2028 {
2029         const struct proto *prot = READ_ONCE(osk->sk_prot);
2030 #ifdef CONFIG_SECURITY_NETWORK
2031         void *sptr = nsk->sk_security;
2032 #endif
2033
2034         /* If we move sk_tx_queue_mapping out of the private section,
2035          * we must check if sk_tx_queue_clear() is called after
2036          * sock_copy() in sk_clone_lock().
2037          */
2038         BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2039                      offsetof(struct sock, sk_dontcopy_begin) ||
2040                      offsetof(struct sock, sk_tx_queue_mapping) >=
2041                      offsetof(struct sock, sk_dontcopy_end));
2042
2043         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2044
2045         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2046                prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
2047
2048 #ifdef CONFIG_SECURITY_NETWORK
2049         nsk->sk_security = sptr;
2050         security_sk_clone(osk, nsk);
2051 #endif
2052 }
2053
2054 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2055                 int family)
2056 {
2057         struct sock *sk;
2058         struct kmem_cache *slab;
2059
2060         slab = prot->slab;
2061         if (slab != NULL) {
2062                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2063                 if (!sk)
2064                         return sk;
2065                 if (want_init_on_alloc(priority))
2066                         sk_prot_clear_nulls(sk, prot->obj_size);
2067         } else
2068                 sk = kmalloc(prot->obj_size, priority);
2069
2070         if (sk != NULL) {
2071                 if (security_sk_alloc(sk, family, priority))
2072                         goto out_free;
2073
2074                 if (!try_module_get(prot->owner))
2075                         goto out_free_sec;
2076         }
2077
2078         return sk;
2079
2080 out_free_sec:
2081         security_sk_free(sk);
2082 out_free:
2083         if (slab != NULL)
2084                 kmem_cache_free(slab, sk);
2085         else
2086                 kfree(sk);
2087         return NULL;
2088 }
2089
2090 static void sk_prot_free(struct proto *prot, struct sock *sk)
2091 {
2092         struct kmem_cache *slab;
2093         struct module *owner;
2094
2095         owner = prot->owner;
2096         slab = prot->slab;
2097
2098         cgroup_sk_free(&sk->sk_cgrp_data);
2099         mem_cgroup_sk_free(sk);
2100         security_sk_free(sk);
2101         if (slab != NULL)
2102                 kmem_cache_free(slab, sk);
2103         else
2104                 kfree(sk);
2105         module_put(owner);
2106 }
2107
2108 /**
2109  *      sk_alloc - All socket objects are allocated here
2110  *      @net: the applicable net namespace
2111  *      @family: protocol family
2112  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2113  *      @prot: struct proto associated with this new sock instance
2114  *      @kern: is this to be a kernel socket?
2115  */
2116 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2117                       struct proto *prot, int kern)
2118 {
2119         struct sock *sk;
2120
2121         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2122         if (sk) {
2123                 sk->sk_family = family;
2124                 /*
2125                  * See comment in struct sock definition to understand
2126                  * why we need sk_prot_creator -acme
2127                  */
2128                 sk->sk_prot = sk->sk_prot_creator = prot;
2129                 sk->sk_kern_sock = kern;
2130                 sock_lock_init(sk);
2131                 sk->sk_net_refcnt = kern ? 0 : 1;
2132                 if (likely(sk->sk_net_refcnt)) {
2133                         get_net_track(net, &sk->ns_tracker, priority);
2134                         sock_inuse_add(net, 1);
2135                 } else {
2136                         __netns_tracker_alloc(net, &sk->ns_tracker,
2137                                               false, priority);
2138                 }
2139
2140                 sock_net_set(sk, net);
2141                 refcount_set(&sk->sk_wmem_alloc, 1);
2142
2143                 mem_cgroup_sk_alloc(sk);
2144                 cgroup_sk_alloc(&sk->sk_cgrp_data);
2145                 sock_update_classid(&sk->sk_cgrp_data);
2146                 sock_update_netprioidx(&sk->sk_cgrp_data);
2147                 sk_tx_queue_clear(sk);
2148         }
2149
2150         return sk;
2151 }
2152 EXPORT_SYMBOL(sk_alloc);
2153
2154 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2155  * grace period. This is the case for UDP sockets and TCP listeners.
2156  */
2157 static void __sk_destruct(struct rcu_head *head)
2158 {
2159         struct sock *sk = container_of(head, struct sock, sk_rcu);
2160         struct sk_filter *filter;
2161
2162         if (sk->sk_destruct)
2163                 sk->sk_destruct(sk);
2164
2165         filter = rcu_dereference_check(sk->sk_filter,
2166                                        refcount_read(&sk->sk_wmem_alloc) == 0);
2167         if (filter) {
2168                 sk_filter_uncharge(sk, filter);
2169                 RCU_INIT_POINTER(sk->sk_filter, NULL);
2170         }
2171
2172         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2173
2174 #ifdef CONFIG_BPF_SYSCALL
2175         bpf_sk_storage_free(sk);
2176 #endif
2177
2178         if (atomic_read(&sk->sk_omem_alloc))
2179                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2180                          __func__, atomic_read(&sk->sk_omem_alloc));
2181
2182         if (sk->sk_frag.page) {
2183                 put_page(sk->sk_frag.page);
2184                 sk->sk_frag.page = NULL;
2185         }
2186
2187         /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2188         put_cred(sk->sk_peer_cred);
2189         put_pid(sk->sk_peer_pid);
2190
2191         if (likely(sk->sk_net_refcnt))
2192                 put_net_track(sock_net(sk), &sk->ns_tracker);
2193         else
2194                 __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2195
2196         sk_prot_free(sk->sk_prot_creator, sk);
2197 }
2198
2199 void sk_destruct(struct sock *sk)
2200 {
2201         bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2202
2203         if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2204                 reuseport_detach_sock(sk);
2205                 use_call_rcu = true;
2206         }
2207
2208         if (use_call_rcu)
2209                 call_rcu(&sk->sk_rcu, __sk_destruct);
2210         else
2211                 __sk_destruct(&sk->sk_rcu);
2212 }
2213
2214 static void __sk_free(struct sock *sk)
2215 {
2216         if (likely(sk->sk_net_refcnt))
2217                 sock_inuse_add(sock_net(sk), -1);
2218
2219         if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2220                 sock_diag_broadcast_destroy(sk);
2221         else
2222                 sk_destruct(sk);
2223 }
2224
2225 void sk_free(struct sock *sk)
2226 {
2227         /*
2228          * We subtract one from sk_wmem_alloc and can know if
2229          * some packets are still in some tx queue.
2230          * If not null, sock_wfree() will call __sk_free(sk) later
2231          */
2232         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2233                 __sk_free(sk);
2234 }
2235 EXPORT_SYMBOL(sk_free);
2236
2237 static void sk_init_common(struct sock *sk)
2238 {
2239         skb_queue_head_init(&sk->sk_receive_queue);
2240         skb_queue_head_init(&sk->sk_write_queue);
2241         skb_queue_head_init(&sk->sk_error_queue);
2242
2243         rwlock_init(&sk->sk_callback_lock);
2244         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2245                         af_rlock_keys + sk->sk_family,
2246                         af_family_rlock_key_strings[sk->sk_family]);
2247         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2248                         af_wlock_keys + sk->sk_family,
2249                         af_family_wlock_key_strings[sk->sk_family]);
2250         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2251                         af_elock_keys + sk->sk_family,
2252                         af_family_elock_key_strings[sk->sk_family]);
2253         lockdep_set_class_and_name(&sk->sk_callback_lock,
2254                         af_callback_keys + sk->sk_family,
2255                         af_family_clock_key_strings[sk->sk_family]);
2256 }
2257
2258 /**
2259  *      sk_clone_lock - clone a socket, and lock its clone
2260  *      @sk: the socket to clone
2261  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2262  *
2263  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2264  */
2265 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2266 {
2267         struct proto *prot = READ_ONCE(sk->sk_prot);
2268         struct sk_filter *filter;
2269         bool is_charged = true;
2270         struct sock *newsk;
2271
2272         newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2273         if (!newsk)
2274                 goto out;
2275
2276         sock_copy(newsk, sk);
2277
2278         newsk->sk_prot_creator = prot;
2279
2280         /* SANITY */
2281         if (likely(newsk->sk_net_refcnt)) {
2282                 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2283                 sock_inuse_add(sock_net(newsk), 1);
2284         } else {
2285                 /* Kernel sockets are not elevating the struct net refcount.
2286                  * Instead, use a tracker to more easily detect if a layer
2287                  * is not properly dismantling its kernel sockets at netns
2288                  * destroy time.
2289                  */
2290                 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2291                                       false, priority);
2292         }
2293         sk_node_init(&newsk->sk_node);
2294         sock_lock_init(newsk);
2295         bh_lock_sock(newsk);
2296         newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
2297         newsk->sk_backlog.len = 0;
2298
2299         atomic_set(&newsk->sk_rmem_alloc, 0);
2300
2301         /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2302         refcount_set(&newsk->sk_wmem_alloc, 1);
2303
2304         atomic_set(&newsk->sk_omem_alloc, 0);
2305         sk_init_common(newsk);
2306
2307         newsk->sk_dst_cache     = NULL;
2308         newsk->sk_dst_pending_confirm = 0;
2309         newsk->sk_wmem_queued   = 0;
2310         newsk->sk_forward_alloc = 0;
2311         newsk->sk_reserved_mem  = 0;
2312         atomic_set(&newsk->sk_drops, 0);
2313         newsk->sk_send_head     = NULL;
2314         newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2315         atomic_set(&newsk->sk_zckey, 0);
2316
2317         sock_reset_flag(newsk, SOCK_DONE);
2318
2319         /* sk->sk_memcg will be populated at accept() time */
2320         newsk->sk_memcg = NULL;
2321
2322         cgroup_sk_clone(&newsk->sk_cgrp_data);
2323
2324         rcu_read_lock();
2325         filter = rcu_dereference(sk->sk_filter);
2326         if (filter != NULL)
2327                 /* though it's an empty new sock, the charging may fail
2328                  * if sysctl_optmem_max was changed between creation of
2329                  * original socket and cloning
2330                  */
2331                 is_charged = sk_filter_charge(newsk, filter);
2332         RCU_INIT_POINTER(newsk->sk_filter, filter);
2333         rcu_read_unlock();
2334
2335         if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2336                 /* We need to make sure that we don't uncharge the new
2337                  * socket if we couldn't charge it in the first place
2338                  * as otherwise we uncharge the parent's filter.
2339                  */
2340                 if (!is_charged)
2341                         RCU_INIT_POINTER(newsk->sk_filter, NULL);
2342                 sk_free_unlock_clone(newsk);
2343                 newsk = NULL;
2344                 goto out;
2345         }
2346         RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2347
2348         if (bpf_sk_storage_clone(sk, newsk)) {
2349                 sk_free_unlock_clone(newsk);
2350                 newsk = NULL;
2351                 goto out;
2352         }
2353
2354         /* Clear sk_user_data if parent had the pointer tagged
2355          * as not suitable for copying when cloning.
2356          */
2357         if (sk_user_data_is_nocopy(newsk))
2358                 newsk->sk_user_data = NULL;
2359
2360         newsk->sk_err      = 0;
2361         newsk->sk_err_soft = 0;
2362         newsk->sk_priority = 0;
2363         newsk->sk_incoming_cpu = raw_smp_processor_id();
2364
2365         /* Before updating sk_refcnt, we must commit prior changes to memory
2366          * (Documentation/RCU/rculist_nulls.rst for details)
2367          */
2368         smp_wmb();
2369         refcount_set(&newsk->sk_refcnt, 2);
2370
2371         sk_set_socket(newsk, NULL);
2372         sk_tx_queue_clear(newsk);
2373         RCU_INIT_POINTER(newsk->sk_wq, NULL);
2374
2375         if (newsk->sk_prot->sockets_allocated)
2376                 sk_sockets_allocated_inc(newsk);
2377
2378         if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2379                 net_enable_timestamp();
2380 out:
2381         return newsk;
2382 }
2383 EXPORT_SYMBOL_GPL(sk_clone_lock);
2384
2385 void sk_free_unlock_clone(struct sock *sk)
2386 {
2387         /* It is still raw copy of parent, so invalidate
2388          * destructor and make plain sk_free() */
2389         sk->sk_destruct = NULL;
2390         bh_unlock_sock(sk);
2391         sk_free(sk);
2392 }
2393 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2394
2395 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2396 {
2397         bool is_ipv6 = false;
2398         u32 max_size;
2399
2400 #if IS_ENABLED(CONFIG_IPV6)
2401         is_ipv6 = (sk->sk_family == AF_INET6 &&
2402                    !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2403 #endif
2404         /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2405         max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2406                         READ_ONCE(dst->dev->gso_ipv4_max_size);
2407         if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2408                 max_size = GSO_LEGACY_MAX_SIZE;
2409
2410         return max_size - (MAX_TCP_HEADER + 1);
2411 }
2412
2413 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2414 {
2415         u32 max_segs = 1;
2416
2417         sk->sk_route_caps = dst->dev->features;
2418         if (sk_is_tcp(sk))
2419                 sk->sk_route_caps |= NETIF_F_GSO;
2420         if (sk->sk_route_caps & NETIF_F_GSO)
2421                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2422         if (unlikely(sk->sk_gso_disabled))
2423                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2424         if (sk_can_gso(sk)) {
2425                 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2426                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2427                 } else {
2428                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2429                         sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2430                         /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2431                         max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2432                 }
2433         }
2434         sk->sk_gso_max_segs = max_segs;
2435         sk_dst_set(sk, dst);
2436 }
2437 EXPORT_SYMBOL_GPL(sk_setup_caps);
2438
2439 /*
2440  *      Simple resource managers for sockets.
2441  */
2442
2443
2444 /*
2445  * Write buffer destructor automatically called from kfree_skb.
2446  */
2447 void sock_wfree(struct sk_buff *skb)
2448 {
2449         struct sock *sk = skb->sk;
2450         unsigned int len = skb->truesize;
2451         bool free;
2452
2453         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2454                 if (sock_flag(sk, SOCK_RCU_FREE) &&
2455                     sk->sk_write_space == sock_def_write_space) {
2456                         rcu_read_lock();
2457                         free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2458                         sock_def_write_space_wfree(sk);
2459                         rcu_read_unlock();
2460                         if (unlikely(free))
2461                                 __sk_free(sk);
2462                         return;
2463                 }
2464
2465                 /*
2466                  * Keep a reference on sk_wmem_alloc, this will be released
2467                  * after sk_write_space() call
2468                  */
2469                 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2470                 sk->sk_write_space(sk);
2471                 len = 1;
2472         }
2473         /*
2474          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2475          * could not do because of in-flight packets
2476          */
2477         if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2478                 __sk_free(sk);
2479 }
2480 EXPORT_SYMBOL(sock_wfree);
2481
2482 /* This variant of sock_wfree() is used by TCP,
2483  * since it sets SOCK_USE_WRITE_QUEUE.
2484  */
2485 void __sock_wfree(struct sk_buff *skb)
2486 {
2487         struct sock *sk = skb->sk;
2488
2489         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2490                 __sk_free(sk);
2491 }
2492
2493 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2494 {
2495         skb_orphan(skb);
2496         skb->sk = sk;
2497 #ifdef CONFIG_INET
2498         if (unlikely(!sk_fullsock(sk))) {
2499                 skb->destructor = sock_edemux;
2500                 sock_hold(sk);
2501                 return;
2502         }
2503 #endif
2504         skb->destructor = sock_wfree;
2505         skb_set_hash_from_sk(skb, sk);
2506         /*
2507          * We used to take a refcount on sk, but following operation
2508          * is enough to guarantee sk_free() wont free this sock until
2509          * all in-flight packets are completed
2510          */
2511         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2512 }
2513 EXPORT_SYMBOL(skb_set_owner_w);
2514
2515 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2516 {
2517 #ifdef CONFIG_TLS_DEVICE
2518         /* Drivers depend on in-order delivery for crypto offload,
2519          * partial orphan breaks out-of-order-OK logic.
2520          */
2521         if (skb->decrypted)
2522                 return false;
2523 #endif
2524         return (skb->destructor == sock_wfree ||
2525                 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2526 }
2527
2528 /* This helper is used by netem, as it can hold packets in its
2529  * delay queue. We want to allow the owner socket to send more
2530  * packets, as if they were already TX completed by a typical driver.
2531  * But we also want to keep skb->sk set because some packet schedulers
2532  * rely on it (sch_fq for example).
2533  */
2534 void skb_orphan_partial(struct sk_buff *skb)
2535 {
2536         if (skb_is_tcp_pure_ack(skb))
2537                 return;
2538
2539         if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2540                 return;
2541
2542         skb_orphan(skb);
2543 }
2544 EXPORT_SYMBOL(skb_orphan_partial);
2545
2546 /*
2547  * Read buffer destructor automatically called from kfree_skb.
2548  */
2549 void sock_rfree(struct sk_buff *skb)
2550 {
2551         struct sock *sk = skb->sk;
2552         unsigned int len = skb->truesize;
2553
2554         atomic_sub(len, &sk->sk_rmem_alloc);
2555         sk_mem_uncharge(sk, len);
2556 }
2557 EXPORT_SYMBOL(sock_rfree);
2558
2559 /*
2560  * Buffer destructor for skbs that are not used directly in read or write
2561  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2562  */
2563 void sock_efree(struct sk_buff *skb)
2564 {
2565         sock_put(skb->sk);
2566 }
2567 EXPORT_SYMBOL(sock_efree);
2568
/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets.
 *
 * The sock is only released, via sock_gen_put(), when
 * sk_is_refcounted() reports that a reference is held.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
        struct sock *owner = skb->sk;

        if (sk_is_refcounted(owner))
                sock_gen_put(owner);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2580
2581 kuid_t sock_i_uid(struct sock *sk)
2582 {
2583         kuid_t uid;
2584
2585         read_lock_bh(&sk->sk_callback_lock);
2586         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2587         read_unlock_bh(&sk->sk_callback_lock);
2588         return uid;
2589 }
2590 EXPORT_SYMBOL(sock_i_uid);
2591
2592 unsigned long __sock_i_ino(struct sock *sk)
2593 {
2594         unsigned long ino;
2595
2596         read_lock(&sk->sk_callback_lock);
2597         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2598         read_unlock(&sk->sk_callback_lock);
2599         return ino;
2600 }
2601 EXPORT_SYMBOL(__sock_i_ino);
2602
/* Return the inode number of the socket attached to @sk (0 if none), as
 * computed by __sock_i_ino(), with bottom halves disabled around the call.
 */
unsigned long sock_i_ino(struct sock *sk)
{
        unsigned long ino;

        local_bh_disable();
        ino = __sock_i_ino(sk);
        local_bh_enable();
        return ino;
}
EXPORT_SYMBOL(sock_i_ino);
2613
2614 /*
2615  * Allocate a skb from the socket's send buffer.
2616  */
2617 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2618                              gfp_t priority)
2619 {
2620         if (force ||
2621             refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2622                 struct sk_buff *skb = alloc_skb(size, priority);
2623
2624                 if (skb) {
2625                         skb_set_owner_w(skb, sk);
2626                         return skb;
2627                 }
2628         }
2629         return NULL;
2630 }
2631 EXPORT_SYMBOL(sock_wmalloc);
2632
2633 static void sock_ofree(struct sk_buff *skb)
2634 {
2635         struct sock *sk = skb->sk;
2636
2637         atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2638 }
2639
2640 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2641                              gfp_t priority)
2642 {
2643         struct sk_buff *skb;
2644
2645         /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2646         if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2647             READ_ONCE(sysctl_optmem_max))
2648                 return NULL;
2649
2650         skb = alloc_skb(size, priority);
2651         if (!skb)
2652                 return NULL;
2653
2654         atomic_add(skb->truesize, &sk->sk_omem_alloc);
2655         skb->sk = sk;
2656         skb->destructor = sock_ofree;
2657         return skb;
2658 }
2659
2660 /*
2661  * Allocate a memory block from the socket's option memory buffer.
2662  */
2663 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2664 {
2665         int optmem_max = READ_ONCE(sysctl_optmem_max);
2666
2667         if ((unsigned int)size <= optmem_max &&
2668             atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2669                 void *mem;
2670                 /* First do the add, to avoid the race if kmalloc
2671                  * might sleep.
2672                  */
2673                 atomic_add(size, &sk->sk_omem_alloc);
2674                 mem = kmalloc(size, priority);
2675                 if (mem)
2676                         return mem;
2677                 atomic_sub(size, &sk->sk_omem_alloc);
2678         }
2679         return NULL;
2680 }
2681 EXPORT_SYMBOL(sock_kmalloc);
2682
2683 /* Free an option memory block. Note, we actually want the inline
2684  * here as this allows gcc to detect the nullify and fold away the
2685  * condition entirely.
2686  */
2687 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2688                                   const bool nullify)
2689 {
2690         if (WARN_ON_ONCE(!mem))
2691                 return;
2692         if (nullify)
2693                 kfree_sensitive(mem);
2694         else
2695                 kfree(mem);
2696         atomic_sub(size, &sk->sk_omem_alloc);
2697 }
2698
/* Free an option memory block of @size bytes with kfree() and subtract
 * @size from the socket's sk_omem_alloc (see __sock_kfree_s()).
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
        __sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);
2704
/* Same as sock_kfree_s(), except that the block is released with
 * kfree_sensitive() instead of kfree().
 */
void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
        __sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);
2710
2711 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2712    I think, these locks should be removed for datagram sockets.
2713  */
2714 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2715 {
2716         DEFINE_WAIT(wait);
2717
2718         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2719         for (;;) {
2720                 if (!timeo)
2721                         break;
2722                 if (signal_pending(current))
2723                         break;
2724                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2725                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2726                 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2727                         break;
2728                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2729                         break;
2730                 if (sk->sk_err)
2731                         break;
2732                 timeo = schedule_timeout(timeo);
2733         }
2734         finish_wait(sk_sleep(sk), &wait);
2735         return timeo;
2736 }
2737
2738
2739 /*
2740  *      Generic send/receive buffer handlers
2741  */
2742
2743 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2744                                      unsigned long data_len, int noblock,
2745                                      int *errcode, int max_page_order)
2746 {
2747         struct sk_buff *skb;
2748         long timeo;
2749         int err;
2750
2751         timeo = sock_sndtimeo(sk, noblock);
2752         for (;;) {
2753                 err = sock_error(sk);
2754                 if (err != 0)
2755                         goto failure;
2756
2757                 err = -EPIPE;
2758                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2759                         goto failure;
2760
2761                 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2762                         break;
2763
2764                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2765                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2766                 err = -EAGAIN;
2767                 if (!timeo)
2768                         goto failure;
2769                 if (signal_pending(current))
2770                         goto interrupted;
2771                 timeo = sock_wait_for_wmem(sk, timeo);
2772         }
2773         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2774                                    errcode, sk->sk_allocation);
2775         if (skb)
2776                 skb_set_owner_w(skb, sk);
2777         return skb;
2778
2779 interrupted:
2780         err = sock_intr_errno(timeo);
2781 failure:
2782         *errcode = err;
2783         return NULL;
2784 }
2785 EXPORT_SYMBOL(sock_alloc_send_pskb);
2786
2787 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2788                      struct sockcm_cookie *sockc)
2789 {
2790         u32 tsflags;
2791
2792         switch (cmsg->cmsg_type) {
2793         case SO_MARK:
2794                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2795                     !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2796                         return -EPERM;
2797                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2798                         return -EINVAL;
2799                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2800                 break;
2801         case SO_TIMESTAMPING_OLD:
2802                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2803                         return -EINVAL;
2804
2805                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2806                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2807                         return -EINVAL;
2808
2809                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2810                 sockc->tsflags |= tsflags;
2811                 break;
2812         case SCM_TXTIME:
2813                 if (!sock_flag(sk, SOCK_TXTIME))
2814                         return -EINVAL;
2815                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2816                         return -EINVAL;
2817                 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2818                 break;
2819         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2820         case SCM_RIGHTS:
2821         case SCM_CREDENTIALS:
2822                 break;
2823         default:
2824                 return -EINVAL;
2825         }
2826         return 0;
2827 }
2828 EXPORT_SYMBOL(__sock_cmsg_send);
2829
2830 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2831                    struct sockcm_cookie *sockc)
2832 {
2833         struct cmsghdr *cmsg;
2834         int ret;
2835
2836         for_each_cmsghdr(cmsg, msg) {
2837                 if (!CMSG_OK(msg, cmsg))
2838                         return -EINVAL;
2839                 if (cmsg->cmsg_level != SOL_SOCKET)
2840                         continue;
2841                 ret = __sock_cmsg_send(sk, cmsg, sockc);
2842                 if (ret)
2843                         return ret;
2844         }
2845         return 0;
2846 }
2847 EXPORT_SYMBOL(sock_cmsg_send);
2848
2849 static void sk_enter_memory_pressure(struct sock *sk)
2850 {
2851         if (!sk->sk_prot->enter_memory_pressure)
2852                 return;
2853
2854         sk->sk_prot->enter_memory_pressure(sk);
2855 }
2856
2857 static void sk_leave_memory_pressure(struct sock *sk)
2858 {
2859         if (sk->sk_prot->leave_memory_pressure) {
2860                 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2861                                      tcp_leave_memory_pressure, sk);
2862         } else {
2863                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2864
2865                 if (memory_pressure && READ_ONCE(*memory_pressure))
2866                         WRITE_ONCE(*memory_pressure, 0);
2867         }
2868 }
2869
/* Static key, false by default.  While it is off, skb_page_frag_refill()
 * may try a high-order page allocation first; once enabled, that attempt
 * is skipped.
 */
DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2871
2872 /**
2873  * skb_page_frag_refill - check that a page_frag contains enough room
2874  * @sz: minimum size of the fragment we want to get
2875  * @pfrag: pointer to page_frag
2876  * @gfp: priority for memory allocation
2877  *
2878  * Note: While this allocator tries to use high order pages, there is
2879  * no guarantee that allocations succeed. Therefore, @sz MUST be
2880  * less or equal than PAGE_SIZE.
2881  */
2882 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2883 {
2884         if (pfrag->page) {
2885                 if (page_ref_count(pfrag->page) == 1) {
2886                         pfrag->offset = 0;
2887                         return true;
2888                 }
2889                 if (pfrag->offset + sz <= pfrag->size)
2890                         return true;
2891                 put_page(pfrag->page);
2892         }
2893
2894         pfrag->offset = 0;
2895         if (SKB_FRAG_PAGE_ORDER &&
2896             !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2897                 /* Avoid direct reclaim but allow kswapd to wake */
2898                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2899                                           __GFP_COMP | __GFP_NOWARN |
2900                                           __GFP_NORETRY,
2901                                           SKB_FRAG_PAGE_ORDER);
2902                 if (likely(pfrag->page)) {
2903                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2904                         return true;
2905                 }
2906         }
2907         pfrag->page = alloc_page(gfp);
2908         if (likely(pfrag->page)) {
2909                 pfrag->size = PAGE_SIZE;
2910                 return true;
2911         }
2912         return false;
2913 }
2914 EXPORT_SYMBOL(skb_page_frag_refill);
2915
2916 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2917 {
2918         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2919                 return true;
2920
2921         sk_enter_memory_pressure(sk);
2922         sk_stream_moderate_sndbuf(sk);
2923         return false;
2924 }
2925 EXPORT_SYMBOL(sk_page_frag_refill);
2926
2927 void __lock_sock(struct sock *sk)
2928         __releases(&sk->sk_lock.slock)
2929         __acquires(&sk->sk_lock.slock)
2930 {
2931         DEFINE_WAIT(wait);
2932
2933         for (;;) {
2934                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2935                                         TASK_UNINTERRUPTIBLE);
2936                 spin_unlock_bh(&sk->sk_lock.slock);
2937                 schedule();
2938                 spin_lock_bh(&sk->sk_lock.slock);
2939                 if (!sock_owned_by_user(sk))
2940                         break;
2941         }
2942         finish_wait(&sk->sk_lock.wq, &wait);
2943 }
2944
2945 void __release_sock(struct sock *sk)
2946         __releases(&sk->sk_lock.slock)
2947         __acquires(&sk->sk_lock.slock)
2948 {
2949         struct sk_buff *skb, *next;
2950
2951         while ((skb = sk->sk_backlog.head) != NULL) {
2952                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2953
2954                 spin_unlock_bh(&sk->sk_lock.slock);
2955
2956                 do {
2957                         next = skb->next;
2958                         prefetch(next);
2959                         DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
2960                         skb_mark_not_on_list(skb);
2961                         sk_backlog_rcv(sk, skb);
2962
2963                         cond_resched();
2964
2965                         skb = next;
2966                 } while (skb != NULL);
2967
2968                 spin_lock_bh(&sk->sk_lock.slock);
2969         }
2970
2971         /*
2972          * Doing the zeroing here guarantee we can not loop forever
2973          * while a wild producer attempts to flood us.
2974          */
2975         sk->sk_backlog.len = 0;
2976 }
2977
/* Run __release_sock() on @sk, taking sk->sk_lock.slock (BH-disabling
 * variant) around the call since __release_sock() expects it held.
 */
void __sk_flush_backlog(struct sock *sk)
{
        spin_lock_bh(&sk->sk_lock.slock);
        __release_sock(sk);
        spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL_GPL(__sk_flush_backlog);
2985
/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 * @skb:   last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 *
 * The wait condition is that the tail of sk_receive_queue differs
 * from @skb.
 *
 * Return: the value produced by sk_wait_event().
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
        DEFINE_WAIT_FUNC(wait, woken_wake_function);
        int rc;

        add_wait_queue(sk_sleep(sk), &wait);
        /* SOCKWQ_ASYNC_WAITDATA is set only for the duration of the wait. */
        sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
        rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
        sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
        remove_wait_queue(sk_sleep(sk), &wait);
        return rc;
}
EXPORT_SYMBOL(sk_wait_data);
3010
/**
 *      __sk_mem_raise_allocated - increase memory_allocated
 *      @sk: socket
 *      @size: memory size to allocate
 *      @amt: pages to allocate
 *      @kind: allocation type
 *
 *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
 *
 *      @amt is accounted (and charged to sk->sk_memcg when socket memcg
 *      accounting applies) before any limit is checked.  If the allocation
 *      is then refused, the accounting and any successful memcg charge are
 *      undone before returning.
 *
 *      Return: 1 if the allocation is accepted, 0 if it is suppressed.
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
        bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
        struct proto *prot = sk->sk_prot;
        bool charged = true;
        long allocated;

        /* Account first; the limit checks below see the updated total. */
        sk_memory_allocated_add(sk, amt);
        allocated = sk_memory_allocated(sk);
        /* 'charged' only becomes false when a memcg charge was attempted
         * and failed.
         */
        if (memcg_charge &&
            !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
                                                gfp_memcg_charge())))
                goto suppress_allocation;

        /* Under limit. */
        if (allocated <= sk_prot_mem_limits(sk, 0)) {
                sk_leave_memory_pressure(sk);
                return 1;
        }

        /* Under pressure. */
        if (allocated > sk_prot_mem_limits(sk, 1))
                sk_enter_memory_pressure(sk);

        /* Over hard limit. */
        if (allocated > sk_prot_mem_limits(sk, 2))
                goto suppress_allocation;

        /* guarantee minimum buffer size under pressure */
        if (kind == SK_MEM_RECV) {
                if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
                        return 1;

        } else { /* SK_MEM_SEND */
                int wmem0 = sk_get_wmem0(sk, prot);

                /* Stream sockets are judged on sk_wmem_queued, all others
                 * on sk_wmem_alloc.
                 */
                if (sk->sk_type == SOCK_STREAM) {
                        if (sk->sk_wmem_queued < wmem0)
                                return 1;
                } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
                                return 1;
                }
        }

        if (sk_has_memory_pressure(sk)) {
                u64 alloc;

                if (!sk_under_memory_pressure(sk))
                        return 1;
                /* Accept if the hard limit exceeds this socket's page usage
                 * multiplied by the number of allocated sockets.
                 */
                alloc = sk_sockets_allocated_read_positive(sk);
                if (sk_prot_mem_limits(sk, 2) > alloc *
                    sk_mem_pages(sk->sk_wmem_queued +
                                 atomic_read(&sk->sk_rmem_alloc) +
                                 sk->sk_forward_alloc))
                        return 1;
        }

suppress_allocation:

        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
                sk_stream_moderate_sndbuf(sk);

                /* Fail only if socket is _under_ its sndbuf.
                 * In this case we cannot block, so that we have to fail.
                 */
                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
                        /* Force charge with __GFP_NOFAIL */
                        if (memcg_charge && !charged) {
                                mem_cgroup_charge_skmem(sk->sk_memcg, amt,
                                        gfp_memcg_charge() | __GFP_NOFAIL);
                        }
                        return 1;
                }
        }

        if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
                trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

        /* Refused: undo the accounting done at the top. */
        sk_memory_allocated_sub(sk, amt);

        if (memcg_charge && charged)
                mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);

        return 0;
}
3105
3106 /**
3107  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3108  *      @sk: socket
3109  *      @size: memory size to allocate
3110  *      @kind: allocation type
3111  *
3112  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3113  *      rmem allocation. This function assumes that protocols which have
3114  *      memory_pressure use sk_wmem_queued as write buffer accounting.
3115  */
3116 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3117 {
3118         int ret, amt = sk_mem_pages(size);
3119
3120         sk->sk_forward_alloc += amt << PAGE_SHIFT;
3121         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3122         if (!ret)
3123                 sk->sk_forward_alloc -= amt << PAGE_SHIFT;
3124         return ret;
3125 }
3126 EXPORT_SYMBOL(__sk_mem_schedule);
3127
3128 /**
3129  *      __sk_mem_reduce_allocated - reclaim memory_allocated
3130  *      @sk: socket
3131  *      @amount: number of quanta
3132  *
3133  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3134  */
3135 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3136 {
3137         sk_memory_allocated_sub(sk, amount);
3138
3139         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3140                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3141
3142         if (sk_under_memory_pressure(sk) &&
3143             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3144                 sk_leave_memory_pressure(sk);
3145 }
3146
3147 /**
3148  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3149  *      @sk: socket
3150  *      @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3151  */
3152 void __sk_mem_reclaim(struct sock *sk, int amount)
3153 {
3154         amount >>= PAGE_SHIFT;
3155         sk->sk_forward_alloc -= amount << PAGE_SHIFT;
3156         __sk_mem_reduce_allocated(sk, amount);
3157 }
3158 EXPORT_SYMBOL(__sk_mem_reclaim);
3159
3160 int sk_set_peek_off(struct sock *sk, int val)
3161 {
3162         sk->sk_peek_off = val;
3163         return 0;
3164 }
3165 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3166
3167 /*
3168  * Set of default routines for initialising struct proto_ops when
3169  * the protocol does not support a particular function. In certain
3170  * cases where it makes no sense for a protocol to have a "do nothing"
3171  * function, some default processing is provided.
3172  */
3173
/* Stub bind handler for protocols that do not support bind(): ignores its
 * arguments and always returns -EOPNOTSUPP.
 */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);
3179
/* Stub connect handler for protocols that do not support connect():
 * ignores its arguments and always returns -EOPNOTSUPP.
 */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
                    int len, int flags)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);
3186
/* Stub socketpair handler for protocols that do not support socketpair():
 * ignores its arguments and always returns -EOPNOTSUPP.
 */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);
3192
/* Stub accept handler for protocols that do not support accept():
 * ignores its arguments and always returns -EOPNOTSUPP.
 */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
                   bool kern)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);
3199
/* Stub getname handler for protocols that cannot report a socket or peer
 * name: ignores its arguments and always returns -EOPNOTSUPP.
 */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
                    int peer)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);
3206
/* Stub ioctl handler for protocols without ioctl support: ignores its
 * arguments and always returns -EOPNOTSUPP.
 */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);
3212
/* Stub listen handler for protocols that do not support listen():
 * ignores its arguments and always returns -EOPNOTSUPP.
 */
int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);
3218
/* Stub shutdown handler for protocols that do not support shutdown():
 * ignores its arguments and always returns -EOPNOTSUPP.
 */
int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);
3224
/* Stub sendmsg handler for protocols that cannot send: ignores its
 * arguments and always returns -EOPNOTSUPP.
 */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);
3230
/* Stub sendmsg_locked handler; unlike sock_no_sendmsg() it takes a
 * struct sock rather than a struct socket.  Ignores its arguments and
 * always returns -EOPNOTSUPP.
 */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);
3236
/* Stub recvmsg handler for protocols that cannot receive: ignores its
 * arguments and always returns -EOPNOTSUPP.
 */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
                    int flags)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);
3243
/* Stub mmap handler for protocols without mmap support.  Unlike the other
 * sock_no_*() stubs it returns -ENODEV, the error code used when an mmap
 * method is missing.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
        /* Mirror missing mmap method error code */
        return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);
3250
3251 /*
3252  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3253  * various sock-based usage counts.
3254  */
3255 void __receive_sock(struct file *file)
3256 {
3257         struct socket *sock;
3258
3259         sock = sock_from_file(file);
3260         if (sock) {
3261                 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3262                 sock_update_classid(&sock->sk->sk_cgrp_data);
3263         }
3264 }
3265
3266 /*
3267  *      Default Socket Callbacks
3268  */
3269
/* Default sk_state_change callback (installed by sock_init_data_uid()):
 * wakes every interruptible sleeper on the socket wait queue, if any.
 */
static void sock_def_wakeup(struct sock *sk)
{
        struct socket_wq *wq;

        /* sk->sk_wq is RCU-protected; hold the read side across its use. */
        rcu_read_lock();
        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_all(&wq->wait);
        rcu_read_unlock();
}
3280
/* Default sk_error_report callback (installed by sock_init_data_uid()):
 * wakes pollers with EPOLLERR if anyone sleeps on the wait queue, then
 * sends the SOCK_WAKE_IO / POLL_ERR async notification.
 */
static void sock_def_error_report(struct sock *sk)
{
        struct socket_wq *wq;

        rcu_read_lock();
        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_poll(&wq->wait, EPOLLERR);
        /* The async notification is sent whether or not anyone sleeps. */
        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
        rcu_read_unlock();
}
3292
/* Default sk_data_ready callback (installed by sock_init_data_uid()):
 * emits the sk_data_ready tracepoint, wakes pollers waiting for readable
 * events if anyone sleeps on the wait queue, then sends the
 * SOCK_WAKE_WAITD / POLL_IN async notification.
 */
void sock_def_readable(struct sock *sk)
{
        struct socket_wq *wq;

        trace_sk_data_ready(sk);

        rcu_read_lock();
        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
                                                EPOLLRDNORM | EPOLLRDBAND);
        /* The async notification is sent whether or not anyone sleeps. */
        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
        rcu_read_unlock();
}
3307
3308 static void sock_def_write_space(struct sock *sk)
3309 {
3310         struct socket_wq *wq;
3311
3312         rcu_read_lock();
3313
3314         /* Do not wake up a writer until he can make "significant"
3315          * progress.  --DaveM
3316          */
3317         if (sock_writeable(sk)) {
3318                 wq = rcu_dereference(sk->sk_wq);
3319                 if (skwq_has_sleeper(wq))
3320                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3321                                                 EPOLLWRNORM | EPOLLWRBAND);
3322
3323                 /* Should agree with poll, otherwise some programs break */
3324                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3325         }
3326
3327         rcu_read_unlock();
3328 }
3329
3330 /* An optimised version of sock_def_write_space(), should only be called
3331  * for SOCK_RCU_FREE sockets under RCU read section and after putting
3332  * ->sk_wmem_alloc.
3333  */
3334 static void sock_def_write_space_wfree(struct sock *sk)
3335 {
3336         /* Do not wake up a writer until he can make "significant"
3337          * progress.  --DaveM
3338          */
3339         if (sock_writeable(sk)) {
3340                 struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3341
3342                 /* rely on refcount_sub from sock_wfree() */
3343                 smp_mb__after_atomic();
3344                 if (wq && waitqueue_active(&wq->wait))
3345                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3346                                                 EPOLLWRNORM | EPOLLWRBAND);
3347
3348                 /* Should agree with poll, otherwise some programs break */
3349                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3350         }
3351 }
3352
/* Default sk_destruct callback (installed by sock_init_data_uid()):
 * intentionally does nothing.
 */
static void sock_def_destruct(struct sock *sk)
{
}
3356
3357 void sk_send_sigurg(struct sock *sk)
3358 {
3359         if (sk->sk_socket && sk->sk_socket->file)
3360                 if (send_sigurg(&sk->sk_socket->file->f_owner))
3361                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3362 }
3363 EXPORT_SYMBOL(sk_send_sigurg);
3364
/**
 * sk_reset_timer - (re)arm a timer tied to a socket
 * @sk: socket on which a reference is taken while the timer is armed
 * @timer: timer to modify
 * @expires: new expiry, passed through to mod_timer()
 *
 * A socket reference is taken only when mod_timer() returns 0, which per
 * the timer API means the timer was not already pending.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
                    unsigned long expires)
{
        if (mod_timer(timer, expires))
                return;

        sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
3372
/**
 * sk_stop_timer - cancel a timer tied to a socket
 * @sk: socket whose reference is dropped if the timer was pending
 * @timer: timer to delete
 *
 * When del_timer() returns non-zero, the socket reference is dropped with
 * __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
        if (!del_timer(timer))
                return;

        __sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
3379
/* Same as sk_stop_timer() but uses del_timer_sync(): when that returns
 * non-zero, the socket reference is dropped with __sock_put().
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
        if (del_timer_sync(timer))
                __sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer_sync);
3386
/**
 * sock_init_data_uid - set the generic fields of a sock to their defaults
 * @sock: socket to attach to @sk, may be NULL
 * @sk: sock being initialised
 * @uid: value stored in sk->sk_uid
 *
 * Installs the default callbacks, buffer sizes, timeouts and pacing values,
 * links @sk and @sock together when @sock is given, and finally sets
 * sk_refcnt to 1.
 */
void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
{
        sk_init_common(sk);
        sk->sk_send_head        =       NULL;

        timer_setup(&sk->sk_timer, NULL, 0);

        sk->sk_allocation       =       GFP_KERNEL;
        sk->sk_rcvbuf           =       READ_ONCE(sysctl_rmem_default);
        sk->sk_sndbuf           =       READ_ONCE(sysctl_wmem_default);
        sk->sk_state            =       TCP_CLOSE;
        sk->sk_use_task_frag    =       true;
        sk_set_socket(sk, sock);

        sock_set_flag(sk, SOCK_ZAPPED);

        /* With a socket, inherit its type and wait queue; without one the
         * sock has no wait queue.
         */
        if (sock) {
                sk->sk_type     =       sock->type;
                RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
                sock->sk        =       sk;
        } else {
                RCU_INIT_POINTER(sk->sk_wq, NULL);
        }
        sk->sk_uid      =       uid;

        /* The lockdep class of sk_callback_lock depends on the address
         * family and on whether this is a kernel socket.
         */
        rwlock_init(&sk->sk_callback_lock);
        if (sk->sk_kern_sock)
                lockdep_set_class_and_name(
                        &sk->sk_callback_lock,
                        af_kern_callback_keys + sk->sk_family,
                        af_family_kern_clock_key_strings[sk->sk_family]);
        else
                lockdep_set_class_and_name(
                        &sk->sk_callback_lock,
                        af_callback_keys + sk->sk_family,
                        af_family_clock_key_strings[sk->sk_family]);

        /* Default callbacks. */
        sk->sk_state_change     =       sock_def_wakeup;
        sk->sk_data_ready       =       sock_def_readable;
        sk->sk_write_space      =       sock_def_write_space;
        sk->sk_error_report     =       sock_def_error_report;
        sk->sk_destruct         =       sock_def_destruct;

        sk->sk_frag.page        =       NULL;
        sk->sk_frag.offset      =       0;
        sk->sk_peek_off         =       -1;

        sk->sk_peer_pid         =       NULL;
        sk->sk_peer_cred        =       NULL;
        spin_lock_init(&sk->sk_peer_lock);

        sk->sk_write_pending    =       0;
        sk->sk_rcvlowat         =       1;
        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;

        sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG==32
        seqlock_init(&sk->sk_stamp_seq);
#endif
        atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
        sk->sk_napi_id          =       0;
        sk->sk_ll_usec          =       READ_ONCE(sysctl_net_busy_read);
#endif

        sk->sk_max_pacing_rate = ~0UL;
        sk->sk_pacing_rate = ~0UL;
        WRITE_ONCE(sk->sk_pacing_shift, 10);
        sk->sk_incoming_cpu = -1;

        sk_rx_queue_clear(sk);
        /*
         * Before updating sk_refcnt, we must commit prior changes to memory
         * (Documentation/RCU/rculist_nulls.rst for details)
         */
        smp_wmb();
        refcount_set(&sk->sk_refcnt, 1);
        atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data_uid);
3469
3470 void sock_init_data(struct socket *sock, struct sock *sk)
3471 {
3472         kuid_t uid = sock ?
3473                 SOCK_INODE(sock)->i_uid :
3474                 make_kuid(sock_net(sk)->user_ns, 0);
3475
3476         sock_init_data_uid(sock, sk, uid);
3477 }
3478 EXPORT_SYMBOL(sock_init_data);
3479
3480 void lock_sock_nested(struct sock *sk, int subclass)
3481 {
3482         /* The sk_lock has mutex_lock() semantics here. */
3483         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3484
3485         might_sleep();
3486         spin_lock_bh(&sk->sk_lock.slock);
3487         if (sock_owned_by_user_nocheck(sk))
3488                 __lock_sock(sk);
3489         sk->sk_lock.owned = 1;
3490         spin_unlock_bh(&sk->sk_lock.slock);
3491 }
3492 EXPORT_SYMBOL(lock_sock_nested);
3493
3494 void release_sock(struct sock *sk)
3495 {
3496         spin_lock_bh(&sk->sk_lock.slock);
3497         if (sk->sk_backlog.tail)
3498                 __release_sock(sk);
3499
3500         /* Warning : release_cb() might need to release sk ownership,
3501          * ie call sock_release_ownership(sk) before us.
3502          */
3503         if (sk->sk_prot->release_cb)
3504                 sk->sk_prot->release_cb(sk);
3505
3506         sock_release_ownership(sk);
3507         if (waitqueue_active(&sk->sk_lock.wq))
3508                 wake_up(&sk->sk_lock.wq);
3509         spin_unlock_bh(&sk->sk_lock.slock);
3510 }
3511 EXPORT_SYMBOL(release_sock);
3512
3513 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3514 {
3515         might_sleep();
3516         spin_lock_bh(&sk->sk_lock.slock);
3517
3518         if (!sock_owned_by_user_nocheck(sk)) {
3519                 /*
3520                  * Fast path return with bottom halves disabled and
3521                  * sock::sk_lock.slock held.
3522                  *
3523                  * The 'mutex' is not contended and holding
3524                  * sock::sk_lock.slock prevents all other lockers to
3525                  * proceed so the corresponding unlock_sock_fast() can
3526                  * avoid the slow path of release_sock() completely and
3527                  * just release slock.
3528                  *
3529                  * From a semantical POV this is equivalent to 'acquiring'
3530                  * the 'mutex', hence the corresponding lockdep
3531                  * mutex_release() has to happen in the fast path of
3532                  * unlock_sock_fast().
3533                  */
3534                 return false;
3535         }
3536
3537         __lock_sock(sk);
3538         sk->sk_lock.owned = 1;
3539         __acquire(&sk->sk_lock.slock);
3540         spin_unlock_bh(&sk->sk_lock.slock);
3541         return true;
3542 }
3543 EXPORT_SYMBOL(__lock_sock_fast);
3544
3545 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3546                    bool timeval, bool time32)
3547 {
3548         struct sock *sk = sock->sk;
3549         struct timespec64 ts;
3550
3551         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3552         ts = ktime_to_timespec64(sock_read_timestamp(sk));
3553         if (ts.tv_sec == -1)
3554                 return -ENOENT;
3555         if (ts.tv_sec == 0) {
3556                 ktime_t kt = ktime_get_real();
3557                 sock_write_timestamp(sk, kt);
3558                 ts = ktime_to_timespec64(kt);
3559         }
3560
3561         if (timeval)
3562                 ts.tv_nsec /= 1000;
3563
3564 #ifdef CONFIG_COMPAT_32BIT_TIME
3565         if (time32)
3566                 return put_old_timespec32(&ts, userstamp);
3567 #endif
3568 #ifdef CONFIG_SPARC64
3569         /* beware of padding in sparc64 timeval */
3570         if (timeval && !in_compat_syscall()) {
3571                 struct __kernel_old_timeval __user tv = {
3572                         .tv_sec = ts.tv_sec,
3573                         .tv_usec = ts.tv_nsec,
3574                 };
3575                 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3576                         return -EFAULT;
3577                 return 0;
3578         }
3579 #endif
3580         return put_timespec64(&ts, userstamp);
3581 }
3582 EXPORT_SYMBOL(sock_gettstamp);
3583
3584 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3585 {
3586         if (!sock_flag(sk, flag)) {
3587                 unsigned long previous_flags = sk->sk_flags;
3588
3589                 sock_set_flag(sk, flag);
3590                 /*
3591                  * we just set one of the two flags which require net
3592                  * time stamping, but time stamping might have been on
3593                  * already because of the other one
3594                  */
3595                 if (sock_needs_netstamp(sk) &&
3596                     !(previous_flags & SK_FLAGS_TIMESTAMP))
3597                         net_enable_timestamp();
3598         }
3599 }
3600
3601 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3602                        int level, int type)
3603 {
3604         struct sock_exterr_skb *serr;
3605         struct sk_buff *skb;
3606         int copied, err;
3607
3608         err = -EAGAIN;
3609         skb = sock_dequeue_err_skb(sk);
3610         if (skb == NULL)
3611                 goto out;
3612
3613         copied = skb->len;
3614         if (copied > len) {
3615                 msg->msg_flags |= MSG_TRUNC;
3616                 copied = len;
3617         }
3618         err = skb_copy_datagram_msg(skb, 0, msg, copied);
3619         if (err)
3620                 goto out_free_skb;
3621
3622         sock_recv_timestamp(msg, sk, skb);
3623
3624         serr = SKB_EXT_ERR(skb);
3625         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3626
3627         msg->msg_flags |= MSG_ERRQUEUE;
3628         err = copied;
3629
3630 out_free_skb:
3631         kfree_skb(skb);
3632 out:
3633         return err;
3634 }
3635 EXPORT_SYMBOL(sock_recv_errqueue);
3636
3637 /*
3638  *      Get a socket option on an socket.
3639  *
3640  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3641  *      asynchronous errors should be reported by getsockopt. We assume
3642  *      this means if you specify SO_ERROR (otherwise whats the point of it).
3643  */
3644 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3645                            char __user *optval, int __user *optlen)
3646 {
3647         struct sock *sk = sock->sk;
3648
3649         /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3650         return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3651 }
3652 EXPORT_SYMBOL(sock_common_getsockopt);
3653
3654 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3655                         int flags)
3656 {
3657         struct sock *sk = sock->sk;
3658         int addr_len = 0;
3659         int err;
3660
3661         err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3662         if (err >= 0)
3663                 msg->msg_namelen = addr_len;
3664         return err;
3665 }
3666 EXPORT_SYMBOL(sock_common_recvmsg);
3667
3668 /*
3669  *      Set socket options on an inet socket.
3670  */
3671 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3672                            sockptr_t optval, unsigned int optlen)
3673 {
3674         struct sock *sk = sock->sk;
3675
3676         /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3677         return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3678 }
3679 EXPORT_SYMBOL(sock_common_setsockopt);
3680
3681 void sk_common_release(struct sock *sk)
3682 {
3683         if (sk->sk_prot->destroy)
3684                 sk->sk_prot->destroy(sk);
3685
3686         /*
3687          * Observation: when sk_common_release is called, processes have
3688          * no access to socket. But net still has.
3689          * Step one, detach it from networking:
3690          *
3691          * A. Remove from hash tables.
3692          */
3693
3694         sk->sk_prot->unhash(sk);
3695
3696         /*
3697          * In this point socket cannot receive new packets, but it is possible
3698          * that some packets are in flight because some CPU runs receiver and
3699          * did hash table lookup before we unhashed socket. They will achieve
3700          * receive queue and will be purged by socket destructor.
3701          *
3702          * Also we still have packets pending on receive queue and probably,
3703          * our own packets waiting in device queues. sock_destroy will drain
3704          * receive queue, but transmitted packets will delay socket destruction
3705          * until the last reference will be released.
3706          */
3707
3708         sock_orphan(sk);
3709
3710         xfrm_sk_free_policy(sk);
3711
3712         sock_put(sk);
3713 }
3714 EXPORT_SYMBOL(sk_common_release);
3715
3716 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3717 {
3718         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3719
3720         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3721         mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3722         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3723         mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3724         mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3725         mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3726         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3727         mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3728         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3729 }
3730
3731 #ifdef CONFIG_PROC_FS
3732 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3733
3734 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3735 {
3736         int cpu, idx = prot->inuse_idx;
3737         int res = 0;
3738
3739         for_each_possible_cpu(cpu)
3740                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3741
3742         return res >= 0 ? res : 0;
3743 }
3744 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3745
3746 int sock_inuse_get(struct net *net)
3747 {
3748         int cpu, res = 0;
3749
3750         for_each_possible_cpu(cpu)
3751                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3752
3753         return res;
3754 }
3755
3756 EXPORT_SYMBOL_GPL(sock_inuse_get);
3757
3758 static int __net_init sock_inuse_init_net(struct net *net)
3759 {
3760         net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3761         if (net->core.prot_inuse == NULL)
3762                 return -ENOMEM;
3763         return 0;
3764 }
3765
3766 static void __net_exit sock_inuse_exit_net(struct net *net)
3767 {
3768         free_percpu(net->core.prot_inuse);
3769 }
3770
3771 static struct pernet_operations net_inuse_ops = {
3772         .init = sock_inuse_init_net,
3773         .exit = sock_inuse_exit_net,
3774 };
3775
3776 static __init int net_inuse_init(void)
3777 {
3778         if (register_pernet_subsys(&net_inuse_ops))
3779                 panic("Cannot initialize net inuse counters");
3780
3781         return 0;
3782 }
3783
3784 core_initcall(net_inuse_init);
3785
3786 static int assign_proto_idx(struct proto *prot)
3787 {
3788         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3789
3790         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3791                 pr_err("PROTO_INUSE_NR exhausted\n");
3792                 return -ENOSPC;
3793         }
3794
3795         set_bit(prot->inuse_idx, proto_inuse_idx);
3796         return 0;
3797 }
3798
3799 static void release_proto_idx(struct proto *prot)
3800 {
3801         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3802                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3803 }
3804 #else
/* !CONFIG_PROC_FS: no in-use index bookkeeping, assignment always succeeds. */
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}

/* !CONFIG_PROC_FS: nothing was assigned, so nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3813
3814 #endif
3815
3816 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3817 {
3818         if (!twsk_prot)
3819                 return;
3820         kfree(twsk_prot->twsk_slab_name);
3821         twsk_prot->twsk_slab_name = NULL;
3822         kmem_cache_destroy(twsk_prot->twsk_slab);
3823         twsk_prot->twsk_slab = NULL;
3824 }
3825
3826 static int tw_prot_init(const struct proto *prot)
3827 {
3828         struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3829
3830         if (!twsk_prot)
3831                 return 0;
3832
3833         twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3834                                               prot->name);
3835         if (!twsk_prot->twsk_slab_name)
3836                 return -ENOMEM;
3837
3838         twsk_prot->twsk_slab =
3839                 kmem_cache_create(twsk_prot->twsk_slab_name,
3840                                   twsk_prot->twsk_obj_size, 0,
3841                                   SLAB_ACCOUNT | prot->slab_flags,
3842                                   NULL);
3843         if (!twsk_prot->twsk_slab) {
3844                 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3845                         prot->name);
3846                 return -ENOMEM;
3847         }
3848
3849         return 0;
3850 }
3851
3852 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3853 {
3854         if (!rsk_prot)
3855                 return;
3856         kfree(rsk_prot->slab_name);
3857         rsk_prot->slab_name = NULL;
3858         kmem_cache_destroy(rsk_prot->slab);
3859         rsk_prot->slab = NULL;
3860 }
3861
3862 static int req_prot_init(const struct proto *prot)
3863 {
3864         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3865
3866         if (!rsk_prot)
3867                 return 0;
3868
3869         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3870                                         prot->name);
3871         if (!rsk_prot->slab_name)
3872                 return -ENOMEM;
3873
3874         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3875                                            rsk_prot->obj_size, 0,
3876                                            SLAB_ACCOUNT | prot->slab_flags,
3877                                            NULL);
3878
3879         if (!rsk_prot->slab) {
3880                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3881                         prot->name);
3882                 return -ENOMEM;
3883         }
3884         return 0;
3885 }
3886
3887 int proto_register(struct proto *prot, int alloc_slab)
3888 {
3889         int ret = -ENOBUFS;
3890
3891         if (prot->memory_allocated && !prot->sysctl_mem) {
3892                 pr_err("%s: missing sysctl_mem\n", prot->name);
3893                 return -EINVAL;
3894         }
3895         if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3896                 pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3897                 return -EINVAL;
3898         }
3899         if (alloc_slab) {
3900                 prot->slab = kmem_cache_create_usercopy(prot->name,
3901                                         prot->obj_size, 0,
3902                                         SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3903                                         prot->slab_flags,
3904                                         prot->useroffset, prot->usersize,
3905                                         NULL);
3906
3907                 if (prot->slab == NULL) {
3908                         pr_crit("%s: Can't create sock SLAB cache!\n",
3909                                 prot->name);
3910                         goto out;
3911                 }
3912
3913                 if (req_prot_init(prot))
3914                         goto out_free_request_sock_slab;
3915
3916                 if (tw_prot_init(prot))
3917                         goto out_free_timewait_sock_slab;
3918         }
3919
3920         mutex_lock(&proto_list_mutex);
3921         ret = assign_proto_idx(prot);
3922         if (ret) {
3923                 mutex_unlock(&proto_list_mutex);
3924                 goto out_free_timewait_sock_slab;
3925         }
3926         list_add(&prot->node, &proto_list);
3927         mutex_unlock(&proto_list_mutex);
3928         return ret;
3929
3930 out_free_timewait_sock_slab:
3931         if (alloc_slab)
3932                 tw_prot_cleanup(prot->twsk_prot);
3933 out_free_request_sock_slab:
3934         if (alloc_slab) {
3935                 req_prot_cleanup(prot->rsk_prot);
3936
3937                 kmem_cache_destroy(prot->slab);
3938                 prot->slab = NULL;
3939         }
3940 out:
3941         return ret;
3942 }
3943 EXPORT_SYMBOL(proto_register);
3944
3945 void proto_unregister(struct proto *prot)
3946 {
3947         mutex_lock(&proto_list_mutex);
3948         release_proto_idx(prot);
3949         list_del(&prot->node);
3950         mutex_unlock(&proto_list_mutex);
3951
3952         kmem_cache_destroy(prot->slab);
3953         prot->slab = NULL;
3954
3955         req_prot_cleanup(prot->rsk_prot);
3956         tw_prot_cleanup(prot->twsk_prot);
3957 }
3958 EXPORT_SYMBOL(proto_unregister);
3959
3960 int sock_load_diag_module(int family, int protocol)
3961 {
3962         if (!protocol) {
3963                 if (!sock_is_registered(family))
3964                         return -ENOENT;
3965
3966                 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3967                                       NETLINK_SOCK_DIAG, family);
3968         }
3969
3970 #ifdef CONFIG_INET
3971         if (family == AF_INET &&
3972             protocol != IPPROTO_RAW &&
3973             protocol < MAX_INET_PROTOS &&
3974             !rcu_access_pointer(inet_protos[protocol]))
3975                 return -ENOENT;
3976 #endif
3977
3978         return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3979                               NETLINK_SOCK_DIAG, family, protocol);
3980 }
3981 EXPORT_SYMBOL(sock_load_diag_module);
3982
3983 #ifdef CONFIG_PROC_FS
3984 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3985         __acquires(proto_list_mutex)
3986 {
3987         mutex_lock(&proto_list_mutex);
3988         return seq_list_start_head(&proto_list, *pos);
3989 }
3990
3991 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3992 {
3993         return seq_list_next(v, &proto_list, pos);
3994 }
3995
3996 static void proto_seq_stop(struct seq_file *seq, void *v)
3997         __releases(proto_list_mutex)
3998 {
3999         mutex_unlock(&proto_list_mutex);
4000 }
4001
/* Map a method pointer to 'y' (set) or 'n' (NULL) for /proc output. */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';
	return 'n';
}
4006 static long sock_prot_memory_allocated(struct proto *proto)
4007 {
4008         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4009 }
4010
4011 static const char *sock_prot_memory_pressure(struct proto *proto)
4012 {
4013         return proto->memory_pressure != NULL ?
4014         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4015 }
4016
4017 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4018 {
4019
4020         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4021                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4022                    proto->name,
4023                    proto->obj_size,
4024                    sock_prot_inuse_get(seq_file_net(seq), proto),
4025                    sock_prot_memory_allocated(proto),
4026                    sock_prot_memory_pressure(proto),
4027                    proto->max_header,
4028                    proto->slab == NULL ? "no" : "yes",
4029                    module_name(proto->owner),
4030                    proto_method_implemented(proto->close),
4031                    proto_method_implemented(proto->connect),
4032                    proto_method_implemented(proto->disconnect),
4033                    proto_method_implemented(proto->accept),
4034                    proto_method_implemented(proto->ioctl),
4035                    proto_method_implemented(proto->init),
4036                    proto_method_implemented(proto->destroy),
4037                    proto_method_implemented(proto->shutdown),
4038                    proto_method_implemented(proto->setsockopt),
4039                    proto_method_implemented(proto->getsockopt),
4040                    proto_method_implemented(proto->sendmsg),
4041                    proto_method_implemented(proto->recvmsg),
4042                    proto_method_implemented(proto->bind),
4043                    proto_method_implemented(proto->backlog_rcv),
4044                    proto_method_implemented(proto->hash),
4045                    proto_method_implemented(proto->unhash),
4046                    proto_method_implemented(proto->get_port),
4047                    proto_method_implemented(proto->enter_memory_pressure));
4048 }
4049
4050 static int proto_seq_show(struct seq_file *seq, void *v)
4051 {
4052         if (v == &proto_list)
4053                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4054                            "protocol",
4055                            "size",
4056                            "sockets",
4057                            "memory",
4058                            "press",
4059                            "maxhdr",
4060                            "slab",
4061                            "module",
4062                            "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4063         else
4064                 proto_seq_printf(seq, list_entry(v, struct proto, node));
4065         return 0;
4066 }
4067
4068 static const struct seq_operations proto_seq_ops = {
4069         .start  = proto_seq_start,
4070         .next   = proto_seq_next,
4071         .stop   = proto_seq_stop,
4072         .show   = proto_seq_show,
4073 };
4074
4075 static __net_init int proto_init_net(struct net *net)
4076 {
4077         if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4078                         sizeof(struct seq_net_private)))
4079                 return -ENOMEM;
4080
4081         return 0;
4082 }
4083
4084 static __net_exit void proto_exit_net(struct net *net)
4085 {
4086         remove_proc_entry("protocols", net->proc_net);
4087 }
4088
4089
4090 static __net_initdata struct pernet_operations proto_net_ops = {
4091         .init = proto_init_net,
4092         .exit = proto_exit_net,
4093 };
4094
4095 static int __init proto_init(void)
4096 {
4097         return register_pernet_subsys(&proto_net_ops);
4098 }
4099
4100 subsys_initcall(proto_init);
4101
4102 #endif /* PROC_FS */
4103
4104 #ifdef CONFIG_NET_RX_BUSY_POLL
4105 bool sk_busy_loop_end(void *p, unsigned long start_time)
4106 {
4107         struct sock *sk = p;
4108
4109         return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
4110                sk_busy_loop_timeout(sk, start_time);
4111 }
4112 EXPORT_SYMBOL(sk_busy_loop_end);
4113 #endif /* CONFIG_NET_RX_BUSY_POLL */
4114
4115 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4116 {
4117         if (!sk->sk_prot->bind_add)
4118                 return -EOPNOTSUPP;
4119         return sk->sk_prot->bind_add(sk, addr, addr_len);
4120 }
4121 EXPORT_SYMBOL(sock_bind_add);
4122
4123 /* Copy 'size' bytes from userspace and return `size` back to userspace */
4124 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4125                      void __user *arg, void *karg, size_t size)
4126 {
4127         int ret;
4128
4129         if (copy_from_user(karg, arg, size))
4130                 return -EFAULT;
4131
4132         ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4133         if (ret)
4134                 return ret;
4135
4136         if (copy_to_user(arg, karg, size))
4137                 return -EFAULT;
4138
4139         return 0;
4140 }
4141 EXPORT_SYMBOL(sock_ioctl_inout);
4142
4143 /* This is the most common ioctl prep function, where the result (4 bytes) is
4144  * copied back to userspace if the ioctl() returns successfully. No input is
4145  * copied from userspace as input argument.
4146  */
4147 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4148 {
4149         int ret, karg = 0;
4150
4151         ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4152         if (ret)
4153                 return ret;
4154
4155         return put_user(karg, (int __user *)arg);
4156 }
4157
4158 /* A wrapper around sock ioctls, which copies the data from userspace
4159  * (depending on the protocol/ioctl), and copies back the result to userspace.
4160  * The main motivation for this function is to pass kernel memory to the
4161  * protocol ioctl callbacks, instead of userspace memory.
4162  */
4163 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4164 {
4165         int rc = 1;
4166
4167         if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4168                 rc = ipmr_sk_ioctl(sk, cmd, arg);
4169         else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4170                 rc = ip6mr_sk_ioctl(sk, cmd, arg);
4171         else if (sk_is_phonet(sk))
4172                 rc = phonet_sk_ioctl(sk, cmd, arg);
4173
4174         /* If ioctl was processed, returns its value */
4175         if (rc <= 0)
4176                 return rc;
4177
4178         /* Otherwise call the default handler */
4179         return sock_ioctl_out(sk, cmd, arg);
4180 }
4181 EXPORT_SYMBOL(sk_ioctl);