net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94 #include <linux/capability.h>
  95 #include <linux/errno.h>
  96 #include <linux/errqueue.h>
  97 #include <linux/types.h>
  98 #include <linux/socket.h>
  99 #include <linux/in.h>
 100 #include <linux/kernel.h>
 101 #include <linux/module.h>
 102 #include <linux/proc_fs.h>
 103 #include <linux/seq_file.h>
 104 #include <linux/sched.h>
 105 #include <linux/sched/mm.h>
 106 #include <linux/timer.h>
 107 #include <linux/string.h>
 108 #include <linux/sockios.h>
 109 #include <linux/net.h>
 110 #include <linux/mm.h>
 111 #include <linux/slab.h>
 112 #include <linux/interrupt.h>
 113 #include <linux/poll.h>
 114 #include <linux/tcp.h>
 115 #include <linux/init.h>
 116 #include <linux/highmem.h>
 117 #include <linux/user_namespace.h>
 118 #include <linux/static_key.h>
 119 #include <linux/memcontrol.h>
 120 #include <linux/prefetch.h>
 121
 122 #include <linux/uaccess.h>
 123
 124 #include <linux/netdevice.h>
 125 #include <net/protocol.h>
 126 #include <linux/skbuff.h>
 127 #include <net/net_namespace.h>
 128 #include <net/request_sock.h>
 129 #include <net/sock.h>
 130 #include <linux/net_tstamp.h>
 131 #include <net/xfrm.h>
 132 #include <linux/ipsec.h>
 133 #include <net/cls_cgroup.h>
 134 #include <net/netprio_cgroup.h>
 135 #include <linux/sock_diag.h>
 136
 137 #include <linux/filter.h>
 138 #include <net/sock_reuseport.h>
 139
 140 #include <trace/events/sock.h>
 141
 142 #include <net/tcp.h>
 143 #include <net/busy_poll.h>
 144
 145 static DEFINE_MUTEX(proto_list_mutex);
 146 static LIST_HEAD(proto_list);
 147
 148 /**
 149  * sk_ns_capable - General socket capability test
 150  * @sk: Socket to use a capability on or through
 151  * @user_ns: The user namespace of the capability to use
 152  * @cap: The capability to use
 153  *
 154  * Test to see if the opener of the socket had when the socket was
 155  * created and the current process has the capability @cap in the user
 156  * namespace @user_ns.
 157  */
 158 bool sk_ns_capable(const struct sock *sk,
 159                    struct user_namespace *user_ns, int cap)
 160 {
 161         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 162                 ns_capable(user_ns, cap);
 163 }
 164 EXPORT_SYMBOL(sk_ns_capable);
 165
 166 /**
 167  * sk_capable - Socket global capability test
 168  * @sk: Socket to use a capability on or through
 169  * @cap: The global capability to use
 170  *
 171  * Test to see if the opener of the socket had when the socket was
 172  * created and the current process has the capability @cap in all user
 173  * namespaces.
 174  */
 175 bool sk_capable(const struct sock *sk, int cap)
 176 {
 177         return sk_ns_capable(sk, &init_user_ns, cap);
 178 }
 179 EXPORT_SYMBOL(sk_capable);
 180
 181 /**
 182  * sk_net_capable - Network namespace socket capability test
 183  * @sk: Socket to use a capability on or through
 184  * @cap: The capability to use
 185  *
 186  * Test to see if the opener of the socket had when the socket was created
 187  * and the current process has the capability @cap over the network namespace
 188  * the socket is a member of.
 189  */
 190 bool sk_net_capable(const struct sock *sk, int cap)
 191 {
 192         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 193 }
 194 EXPORT_SYMBOL(sk_net_capable);
 195
 196 /*
 197  * Each address family might have different locking rules, so we have
 198  * one slock key per address family and separate keys for internal and
 199  * userspace sockets.
 200  */
 201 static struct lock_class_key af_family_keys[AF_MAX];
 202 static struct lock_class_key af_family_kern_keys[AF_MAX];
 203 static struct lock_class_key af_family_slock_keys[AF_MAX];
 204 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 205
 206 /*
 207  * Make lock validator output more readable. (we pre-construct these
 208  * strings build-time, so that runtime initialization of socket
 209  * locks is fast):
 210  */
 211
 212 #define _sock_locks(x)                                            \
 213   x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 214   x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 215   x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 216   x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 217   x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 218   x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 219   x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 220   x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 221   x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 222   x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 223   x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 224   x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 225   x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 226   x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 227   x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_MAX"
 228
 229 static const char *const af_family_key_strings[AF_MAX+1] = {
 230         _sock_locks("sk_lock-")
 231 };
 232 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 233         _sock_locks("slock-")
 234 };
 235 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 236         _sock_locks("clock-")
 237 };
 238
 239 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 240         _sock_locks("k-sk_lock-")
 241 };
 242 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 243         _sock_locks("k-slock-")
 244 };
 245 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 246         _sock_locks("k-clock-")
 247 };
 248 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 249   "rlock-AF_UNSPEC", "rlock-AF_UNIX"     , "rlock-AF_INET"     ,
 250   "rlock-AF_AX25"  , "rlock-AF_IPX"      , "rlock-AF_APPLETALK",
 251   "rlock-AF_NETROM", "rlock-AF_BRIDGE"   , "rlock-AF_ATMPVC"   ,
 252   "rlock-AF_X25"   , "rlock-AF_INET6"    , "rlock-AF_ROSE"     ,
 253   "rlock-AF_DECnet", "rlock-AF_NETBEUI"  , "rlock-AF_SECURITY" ,
 254   "rlock-AF_KEY"   , "rlock-AF_NETLINK"  , "rlock-AF_PACKET"   ,
 255   "rlock-AF_ASH"   , "rlock-AF_ECONET"   , "rlock-AF_ATMSVC"   ,
 256   "rlock-AF_RDS"   , "rlock-AF_SNA"      , "rlock-AF_IRDA"     ,
 257   "rlock-AF_PPPOX" , "rlock-AF_WANPIPE"  , "rlock-AF_LLC"      ,
 258   "rlock-27"       , "rlock-28"          , "rlock-AF_CAN"      ,
 259   "rlock-AF_TIPC"  , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV"     ,
 260   "rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
 261   "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
 262   "rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
 263   "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_MAX"
 264 };
 265 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 266   "wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
 267   "wlock-AF_AX25"  , "wlock-AF_IPX"      , "wlock-AF_APPLETALK",
 268   "wlock-AF_NETROM", "wlock-AF_BRIDGE"   , "wlock-AF_ATMPVC"   ,
 269   "wlock-AF_X25"   , "wlock-AF_INET6"    , "wlock-AF_ROSE"     ,
 270   "wlock-AF_DECnet", "wlock-AF_NETBEUI"  , "wlock-AF_SECURITY" ,
 271   "wlock-AF_KEY"   , "wlock-AF_NETLINK"  , "wlock-AF_PACKET"   ,
 272   "wlock-AF_ASH"   , "wlock-AF_ECONET"   , "wlock-AF_ATMSVC"   ,
 273   "wlock-AF_RDS"   , "wlock-AF_SNA"      , "wlock-AF_IRDA"     ,
 274   "wlock-AF_PPPOX" , "wlock-AF_WANPIPE"  , "wlock-AF_LLC"      ,
 275   "wlock-27"       , "wlock-28"          , "wlock-AF_CAN"      ,
 276   "wlock-AF_TIPC"  , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV"     ,
 277   "wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
 278   "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
 279   "wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
 280   "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_MAX"
 281 };
 282 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 283   "elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
 284   "elock-AF_AX25"  , "elock-AF_IPX"      , "elock-AF_APPLETALK",
 285   "elock-AF_NETROM", "elock-AF_BRIDGE"   , "elock-AF_ATMPVC"   ,
 286   "elock-AF_X25"   , "elock-AF_INET6"    , "elock-AF_ROSE"     ,
 287   "elock-AF_DECnet", "elock-AF_NETBEUI"  , "elock-AF_SECURITY" ,
 288   "elock-AF_KEY"   , "elock-AF_NETLINK"  , "elock-AF_PACKET"   ,
 289   "elock-AF_ASH"   , "elock-AF_ECONET"   , "elock-AF_ATMSVC"   ,
 290   "elock-AF_RDS"   , "elock-AF_SNA"      , "elock-AF_IRDA"     ,
 291   "elock-AF_PPPOX" , "elock-AF_WANPIPE"  , "elock-AF_LLC"      ,
 292   "elock-27"       , "elock-28"          , "elock-AF_CAN"      ,
 293   "elock-AF_TIPC"  , "elock-AF_BLUETOOTH", "elock-AF_IUCV"     ,
 294   "elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
 295   "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
 296   "elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
 297   "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_MAX"
 298 };
 299
 300 /*
 301  * sk_callback_lock and sk queues locking rules are per-address-family,
 302  * so split the lock classes by using a per-AF key:
 303  */
 304 static struct lock_class_key af_callback_keys[AF_MAX];
 305 static struct lock_class_key af_rlock_keys[AF_MAX];
 306 static struct lock_class_key af_wlock_keys[AF_MAX];
 307 static struct lock_class_key af_elock_keys[AF_MAX];
 308 static struct lock_class_key af_kern_callback_keys[AF_MAX];
 309
 310 /* Take into consideration the size of the struct sk_buff overhead in the
 311  * determination of these values, since that is non-constant across
 312  * platforms.  This makes socket queueing behavior and performance
 313  * not depend upon such differences.
 314  */
 315 #define _SK_MEM_PACKETS         256
 316 #define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
 317 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 318 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 319
 320 /* Run time adjustable parameters. */
 321 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 322 EXPORT_SYMBOL(sysctl_wmem_max);
 323 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 324 EXPORT_SYMBOL(sysctl_rmem_max);
 325 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 326 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 327
 328 /* Maximal space eaten by iovec or ancillary data plus some space */
 329 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 330 EXPORT_SYMBOL(sysctl_optmem_max);
 331
 332 int sysctl_tstamp_allow_data __read_mostly = 1;
 333
 334 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
 335 EXPORT_SYMBOL_GPL(memalloc_socks);
 336
 337 /**
 338  * sk_set_memalloc - sets %SOCK_MEMALLOC
 339  * @sk: socket to set it on
 340  *
 341  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 342  * It's the responsibility of the admin to adjust min_free_kbytes
 343  * to meet the requirements
 344  */
 345 void sk_set_memalloc(struct sock *sk)
 346 {
 347         sock_set_flag(sk, SOCK_MEMALLOC);
 348         sk->sk_allocation |= __GFP_MEMALLOC;
 349         static_key_slow_inc(&memalloc_socks);
 350 }
 351 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 352
 353 void sk_clear_memalloc(struct sock *sk)
 354 {
 355         sock_reset_flag(sk, SOCK_MEMALLOC);
 356         sk->sk_allocation &= ~__GFP_MEMALLOC;
 357         static_key_slow_dec(&memalloc_socks);
 358
 359         /*
 360          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 361          * progress of swapping. SOCK_MEMALLOC may be cleared while
 362          * it has rmem allocations due to the last swapfile being deactivated
 363          * but there is a risk that the socket is unusable due to exceeding
 364          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 365          */
 366         sk_mem_reclaim(sk);
 367 }
 368 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 369
 370 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 371 {
 372         int ret;
 373         unsigned int noreclaim_flag;
 374
 375         /* these should have been dropped before queueing */
 376         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 377
 378         noreclaim_flag = memalloc_noreclaim_save();
 379         ret = sk->sk_backlog_rcv(sk, skb);
 380         memalloc_noreclaim_restore(noreclaim_flag);
 381
 382         return ret;
 383 }
 384 EXPORT_SYMBOL(__sk_backlog_rcv);
 385
 386 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 387 {
 388         struct timeval tv;
 389
 390         if (optlen < sizeof(tv))
 391                 return -EINVAL;
 392         if (copy_from_user(&tv, optval, sizeof(tv)))
 393                 return -EFAULT;
 394         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 395                 return -EDOM;
 396
 397         if (tv.tv_sec < 0) {
 398                 static int warned __read_mostly;
 399
 400                 *timeo_p = 0;
 401                 if (warned < 10 && net_ratelimit()) {
 402                         warned++;
 403                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 404                                 __func__, current->comm, task_pid_nr(current));
 405                 }
 406                 return 0;
 407         }
 408         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 409         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 410                 return 0;
 411         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 412                 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
 413         return 0;
 414 }
 415
 416 static void sock_warn_obsolete_bsdism(const char *name)
 417 {
 418         static int warned;
 419         static char warncomm[TASK_COMM_LEN];
 420         if (strcmp(warncomm, current->comm) && warned < 5) {
 421                 strcpy(warncomm,  current->comm);
 422                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 423                         warncomm, name);
 424                 warned++;
 425         }
 426 }
 427
 428 static bool sock_needs_netstamp(const struct sock *sk)
 429 {
 430         switch (sk->sk_family) {
 431         case AF_UNSPEC:
 432         case AF_UNIX:
 433                 return false;
 434         default:
 435                 return true;
 436         }
 437 }
 438
 439 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 440 {
 441         if (sk->sk_flags & flags) {
 442                 sk->sk_flags &= ~flags;
 443                 if (sock_needs_netstamp(sk) &&
 444                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 445                         net_disable_timestamp();
 446         }
 447 }
 448
 449
 450 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 451 {
 452         unsigned long flags;
 453         struct sk_buff_head *list = &sk->sk_receive_queue;
 454
 455         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 456                 atomic_inc(&sk->sk_drops);
 457                 trace_sock_rcvqueue_full(sk, skb);
 458                 return -ENOMEM;
 459         }
 460
 461         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 462                 atomic_inc(&sk->sk_drops);
 463                 return -ENOBUFS;
 464         }
 465
 466         skb->dev = NULL;
 467         skb_set_owner_r(skb, sk);
 468
 469         /* we escape from rcu protected region, make sure we dont leak
 470          * a norefcounted dst
 471          */
 472         skb_dst_force(skb);
 473
 474         spin_lock_irqsave(&list->lock, flags);
 475         sock_skb_set_dropcount(sk, skb);
 476         __skb_queue_tail(list, skb);
 477         spin_unlock_irqrestore(&list->lock, flags);
 478
 479         if (!sock_flag(sk, SOCK_DEAD))
 480                 sk->sk_data_ready(sk);
 481         return 0;
 482 }
 483 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 484
 485 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 486 {
 487         int err;
 488
 489         err = sk_filter(sk, skb);
 490         if (err)
 491                 return err;
 492
 493         return __sock_queue_rcv_skb(sk, skb);
 494 }
 495 EXPORT_SYMBOL(sock_queue_rcv_skb);
 496
 497 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 498                      const int nested, unsigned int trim_cap, bool refcounted)
 499 {
 500         int rc = NET_RX_SUCCESS;
 501
 502         if (sk_filter_trim_cap(sk, skb, trim_cap))
 503                 goto discard_and_relse;
 504
 505         skb->dev = NULL;
 506
 507         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 508                 atomic_inc(&sk->sk_drops);
 509                 goto discard_and_relse;
 510         }
 511         if (nested)
 512                 bh_lock_sock_nested(sk);
 513         else
 514                 bh_lock_sock(sk);
 515         if (!sock_owned_by_user(sk)) {
 516                 /*
 517                  * trylock + unlock semantics:
 518                  */
 519                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 520
 521                 rc = sk_backlog_rcv(sk, skb);
 522
 523                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 524         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 525                 bh_unlock_sock(sk);
 526                 atomic_inc(&sk->sk_drops);
 527                 goto discard_and_relse;
 528         }
 529
 530         bh_unlock_sock(sk);
 531 out:
 532         if (refcounted)
 533                 sock_put(sk);
 534         return rc;
 535 discard_and_relse:
 536         kfree_skb(skb);
 537         goto out;
 538 }
 539 EXPORT_SYMBOL(__sk_receive_skb);
 540
 541 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 542 {
 543         struct dst_entry *dst = __sk_dst_get(sk);
 544
 545         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 546                 sk_tx_queue_clear(sk);
 547                 sk->sk_dst_pending_confirm = 0;
 548                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 549                 dst_release(dst);
 550                 return NULL;
 551         }
 552
 553         return dst;
 554 }
 555 EXPORT_SYMBOL(__sk_dst_check);
 556
 557 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 558 {
 559         struct dst_entry *dst = sk_dst_get(sk);
 560
 561         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 562                 sk_dst_reset(sk);
 563                 dst_release(dst);
 564                 return NULL;
 565         }
 566
 567         return dst;
 568 }
 569 EXPORT_SYMBOL(sk_dst_check);
 570
 571 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 572                                 int optlen)
 573 {
 574         int ret = -ENOPROTOOPT;
 575 #ifdef CONFIG_NETDEVICES
 576         struct net *net = sock_net(sk);
 577         char devname[IFNAMSIZ];
 578         int index;
 579
 580         /* Sorry... */
 581         ret = -EPERM;
 582         if (!ns_capable(net->user_ns, CAP_NET_RAW))
 583                 goto out;
 584
 585         ret = -EINVAL;
 586         if (optlen < 0)
 587                 goto out;
 588
 589         /* Bind this socket to a particular device like "eth0",
 590          * as specified in the passed interface name. If the
 591          * name is "" or the option length is zero the socket
 592          * is not bound.
 593          */
 594         if (optlen > IFNAMSIZ - 1)
 595                 optlen = IFNAMSIZ - 1;
 596         memset(devname, 0, sizeof(devname));
 597
 598         ret = -EFAULT;
 599         if (copy_from_user(devname, optval, optlen))
 600                 goto out;
 601
 602         index = 0;
 603         if (devname[0] != '\0') {
 604                 struct net_device *dev;
 605
 606                 rcu_read_lock();
 607                 dev = dev_get_by_name_rcu(net, devname);
 608                 if (dev)
 609                         index = dev->ifindex;
 610                 rcu_read_unlock();
 611                 ret = -ENODEV;
 612                 if (!dev)
 613                         goto out;
 614         }
 615
 616         lock_sock(sk);
 617         sk->sk_bound_dev_if = index;
 618         sk_dst_reset(sk);
 619         release_sock(sk);
 620
 621         ret = 0;
 622
 623 out:
 624 #endif
 625
 626         return ret;
 627 }
 628
 629 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 630                                 int __user *optlen, int len)
 631 {
 632         int ret = -ENOPROTOOPT;
 633 #ifdef CONFIG_NETDEVICES
 634         struct net *net = sock_net(sk);
 635         char devname[IFNAMSIZ];
 636
 637         if (sk->sk_bound_dev_if == 0) {
 638                 len = 0;
 639                 goto zero;
 640         }
 641
 642         ret = -EINVAL;
 643         if (len < IFNAMSIZ)
 644                 goto out;
 645
 646         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 647         if (ret)
 648                 goto out;
 649
 650         len = strlen(devname) + 1;
 651
 652         ret = -EFAULT;
 653         if (copy_to_user(optval, devname, len))
 654                 goto out;
 655
 656 zero:
 657         ret = -EFAULT;
 658         if (put_user(len, optlen))
 659                 goto out;
 660
 661         ret = 0;
 662
 663 out:
 664 #endif
 665
 666         return ret;
 667 }
 668
 669 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 670 {
 671         if (valbool)
 672                 sock_set_flag(sk, bit);
 673         else
 674                 sock_reset_flag(sk, bit);
 675 }
 676
 677 bool sk_mc_loop(struct sock *sk)
 678 {
 679         if (dev_recursion_level())
 680                 return false;
 681         if (!sk)
 682                 return true;
 683         switch (sk->sk_family) {
 684         case AF_INET:
 685                 return inet_sk(sk)->mc_loop;
 686 #if IS_ENABLED(CONFIG_IPV6)
 687         case AF_INET6:
 688                 return inet6_sk(sk)->mc_loop;
 689 #endif
 690         }
 691         WARN_ON(1);
 692         return true;
 693 }
 694 EXPORT_SYMBOL(sk_mc_loop);
 695
 696 /*
 697  *      This is meant for all protocols to use and covers goings on
 698  *      at the socket level. Everything here is generic.
 699  */
 700
 701 int sock_setsockopt(struct socket *sock, int level, int optname,
 702                     char __user *optval, unsigned int optlen)
 703 {
 704         struct sock *sk = sock->sk;
 705         int val;
 706         int valbool;
 707         struct linger ling;
 708         int ret = 0;
 709
 710         /*
 711          *      Options without arguments
 712          */
 713
 714         if (optname == SO_BINDTODEVICE)
 715                 return sock_setbindtodevice(sk, optval, optlen);
 716
 717         if (optlen < sizeof(int))
 718                 return -EINVAL;
 719
 720         if (get_user(val, (int __user *)optval))
 721                 return -EFAULT;
 722
 723         valbool = val ? 1 : 0;
 724
 725         lock_sock(sk);
 726
 727         switch (optname) {
 728         case SO_DEBUG:
 729                 if (val && !capable(CAP_NET_ADMIN))
 730                         ret = -EACCES;
 731                 else
 732                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 733                 break;
 734         case SO_REUSEADDR:
 735                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 736                 break;
 737         case SO_REUSEPORT:
 738                 sk->sk_reuseport = valbool;
 739                 break;
 740         case SO_TYPE:
 741         case SO_PROTOCOL:
 742         case SO_DOMAIN:
 743         case SO_ERROR:
 744                 ret = -ENOPROTOOPT;
 745                 break;
 746         case SO_DONTROUTE:
 747                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 748                 break;
 749         case SO_BROADCAST:
 750                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 751                 break;
 752         case SO_SNDBUF:
 753                 /* Don't error on this BSD doesn't and if you think
 754                  * about it this is right. Otherwise apps have to
 755                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 756                  * are treated in BSD as hints
 757                  */
 758                 val = min_t(u32, val, sysctl_wmem_max);
 759 set_sndbuf:
 760                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 761                 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 762                 /* Wake up sending tasks if we upped the value. */
 763                 sk->sk_write_space(sk);
 764                 break;
 765
 766         case SO_SNDBUFFORCE:
 767                 if (!capable(CAP_NET_ADMIN)) {
 768                         ret = -EPERM;
 769                         break;
 770                 }
 771                 goto set_sndbuf;
 772
 773         case SO_RCVBUF:
 774                 /* Don't error on this BSD doesn't and if you think
 775                  * about it this is right. Otherwise apps have to
 776                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 777                  * are treated in BSD as hints
 778                  */
 779                 val = min_t(u32, val, sysctl_rmem_max);
 780 set_rcvbuf:
 781                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 782                 /*
 783                  * We double it on the way in to account for
 784                  * "struct sk_buff" etc. overhead.   Applications
 785                  * assume that the SO_RCVBUF setting they make will
 786                  * allow that much actual data to be received on that
 787                  * socket.
 788                  *
 789                  * Applications are unaware that "struct sk_buff" and
 790                  * other overheads allocate from the receive buffer
 791                  * during socket buffer allocation.
 792                  *
 793                  * And after considering the possible alternatives,
 794                  * returning the value we actually used in getsockopt
 795                  * is the most desirable behavior.
 796                  */
 797                 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 798                 break;
 799
 800         case SO_RCVBUFFORCE:
 801                 if (!capable(CAP_NET_ADMIN)) {
 802                         ret = -EPERM;
 803                         break;
 804                 }
 805                 goto set_rcvbuf;
 806
 807         case SO_KEEPALIVE:
 808                 if (sk->sk_prot->keepalive)
 809                         sk->sk_prot->keepalive(sk, valbool);
 810                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 811                 break;
 812
 813         case SO_OOBINLINE:
 814                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 815                 break;
 816
 817         case SO_NO_CHECK:
 818                 sk->sk_no_check_tx = valbool;
 819                 break;
 820
 821         case SO_PRIORITY:
 822                 if ((val >= 0 && val <= 6) ||
 823                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 824                         sk->sk_priority = val;
 825                 else
 826                         ret = -EPERM;
 827                 break;
 828
 829         case SO_LINGER:
 830                 if (optlen < sizeof(ling)) {
 831                         ret = -EINVAL;  /* 1003.1g */
 832                         break;
 833                 }
 834                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 835                         ret = -EFAULT;
 836                         break;
 837                 }
 838                 if (!ling.l_onoff)
 839                         sock_reset_flag(sk, SOCK_LINGER);
 840                 else {
 841 #if (BITS_PER_LONG == 32)
 842                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 843                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 844                         else
 845 #endif
 846                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 847                         sock_set_flag(sk, SOCK_LINGER);
 848                 }
 849                 break;
 850
 851         case SO_BSDCOMPAT:
 852                 sock_warn_obsolete_bsdism("setsockopt");
 853                 break;
 854
 855         case SO_PASSCRED:
 856                 if (valbool)
 857                         set_bit(SOCK_PASSCRED, &sock->flags);
 858                 else
 859                         clear_bit(SOCK_PASSCRED, &sock->flags);
 860                 break;
 861
 862         case SO_TIMESTAMP:
 863         case SO_TIMESTAMPNS:
 864                 if (valbool)  {
 865                         if (optname == SO_TIMESTAMP)
 866                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 867                         else
 868                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 869                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 870                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 871                 } else {
 872                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 873                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 874                 }
 875                 break;
 876
 877         case SO_TIMESTAMPING:
 878                 if (val & ~SOF_TIMESTAMPING_MASK) {
 879                         ret = -EINVAL;
 880                         break;
 881                 }
 882
 883                 if (val & SOF_TIMESTAMPING_OPT_ID &&
 884                     !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 885                         if (sk->sk_protocol == IPPROTO_TCP &&
 886                             sk->sk_type == SOCK_STREAM) {
 887                                 if ((1 << sk->sk_state) &
 888                                     (TCPF_CLOSE | TCPF_LISTEN)) {
 889                                         ret = -EINVAL;
 890                                         break;
 891                                 }
 892                                 sk->sk_tskey = tcp_sk(sk)->snd_una;
 893                         } else {
 894                                 sk->sk_tskey = 0;
 895                         }
 896                 }
 897
 898                 if (val & SOF_TIMESTAMPING_OPT_STATS &&
 899                     !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 900                         ret = -EINVAL;
 901                         break;
 902                 }
 903
 904                 sk->sk_tsflags = val;
 905                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 906                         sock_enable_timestamp(sk,
 907                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 908                 else
 909                         sock_disable_timestamp(sk,
 910                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 911                 break;
 912
 913         case SO_RCVLOWAT:
 914                 if (val < 0)
 915                         val = INT_MAX;
 916                 sk->sk_rcvlowat = val ? : 1;
 917                 break;
 918
 919         case SO_RCVTIMEO:
 920                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 921                 break;
 922
 923         case SO_SNDTIMEO:
 924                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 925                 break;
 926
 927         case SO_ATTACH_FILTER:
 928                 ret = -EINVAL;
 929                 if (optlen == sizeof(struct sock_fprog)) {
 930                         struct sock_fprog fprog;
 931
 932                         ret = -EFAULT;
 933                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 934                                 break;
 935
 936                         ret = sk_attach_filter(&fprog, sk);
 937                 }
 938                 break;
 939
 940         case SO_ATTACH_BPF:
 941                 ret = -EINVAL;
 942                 if (optlen == sizeof(u32)) {
 943                         u32 ufd;
 944
 945                         ret = -EFAULT;
 946                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 947                                 break;
 948
 949                         ret = sk_attach_bpf(ufd, sk);
 950                 }
 951                 break;
 952
 953         case SO_ATTACH_REUSEPORT_CBPF:
 954                 ret = -EINVAL;
 955                 if (optlen == sizeof(struct sock_fprog)) {
 956                         struct sock_fprog fprog;
 957
 958                         ret = -EFAULT;
 959                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 960                                 break;
 961
 962                         ret = sk_reuseport_attach_filter(&fprog, sk);
 963                 }
 964                 break;
 965
 966         case SO_ATTACH_REUSEPORT_EBPF:
 967                 ret = -EINVAL;
 968                 if (optlen == sizeof(u32)) {
 969                         u32 ufd;
 970
 971                         ret = -EFAULT;
 972                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 973                                 break;
 974
 975                         ret = sk_reuseport_attach_bpf(ufd, sk);
 976                 }
 977                 break;
 978
 979         case SO_DETACH_FILTER:
 980                 ret = sk_detach_filter(sk);
 981                 break;
 982
 983         case SO_LOCK_FILTER:
 984                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 985                         ret = -EPERM;
 986                 else
 987                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 988                 break;
 989
 990         case SO_PASSSEC:
 991                 if (valbool)
 992                         set_bit(SOCK_PASSSEC, &sock->flags);
 993                 else
 994                         clear_bit(SOCK_PASSSEC, &sock->flags);
 995                 break;
 996         case SO_MARK:
 997                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 998                         ret = -EPERM;
 999                 else
1000                         sk->sk_mark = val;
1001                 break;
1002
1003         case SO_RXQ_OVFL:
1004                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1005                 break;
1006
1007         case SO_WIFI_STATUS:
1008                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1009                 break;
1010
1011         case SO_PEEK_OFF:
1012                 if (sock->ops->set_peek_off)
1013                         ret = sock->ops->set_peek_off(sk, val);
1014                 else
1015                         ret = -EOPNOTSUPP;
1016                 break;
1017
1018         case SO_NOFCS:
1019                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1020                 break;
1021
1022         case SO_SELECT_ERR_QUEUE:
1023                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1024                 break;
1025
1026 #ifdef CONFIG_NET_RX_BUSY_POLL
1027         case SO_BUSY_POLL:
1028                 /* allow unprivileged users to decrease the value */
1029                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1030                         ret = -EPERM;
1031                 else {
1032                         if (val < 0)
1033                                 ret = -EINVAL;
1034                         else
1035                                 sk->sk_ll_usec = val;
1036                 }
1037                 break;
1038 #endif
1039
1040         case SO_MAX_PACING_RATE:
1041                 if (val != ~0U)
1042                         cmpxchg(&sk->sk_pacing_status,
1043                                 SK_PACING_NONE,
1044                                 SK_PACING_NEEDED);
1045                 sk->sk_max_pacing_rate = val;
1046                 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1047                                          sk->sk_max_pacing_rate);
1048                 break;
1049
1050         case SO_INCOMING_CPU:
1051                 sk->sk_incoming_cpu = val;
1052                 break;
1053
1054         case SO_CNX_ADVICE:
1055                 if (val == 1)
1056                         dst_negative_advice(sk);
1057                 break;
1058         default:
1059                 ret = -ENOPROTOOPT;
1060                 break;
1061         }
1062         release_sock(sk);
1063         return ret;
1064 }
1065 EXPORT_SYMBOL(sock_setsockopt);
1066
1067
1068 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1069                           struct ucred *ucred)
1070 {
1071         ucred->pid = pid_vnr(pid);
1072         ucred->uid = ucred->gid = -1;
1073         if (cred) {
1074                 struct user_namespace *current_ns = current_user_ns();
1075
1076                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1077                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1078         }
1079 }
1080
1081 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1082 {
1083         struct user_namespace *user_ns = current_user_ns();
1084         int i;
1085
1086         for (i = 0; i < src->ngroups; i++)
1087                 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1088                         return -EFAULT;
1089
1090         return 0;
1091 }
1092
1093 int sock_getsockopt(struct socket *sock, int level, int optname,
1094                     char __user *optval, int __user *optlen)
1095 {
1096         struct sock *sk = sock->sk;
1097
1098         union {
1099                 int val;
1100                 u64 val64;
1101                 struct linger ling;
1102                 struct timeval tm;
1103         } v;
1104
1105         int lv = sizeof(int);
1106         int len;
1107
1108         if (get_user(len, optlen))
1109                 return -EFAULT;
1110         if (len < 0)
1111                 return -EINVAL;
1112
1113         memset(&v, 0, sizeof(v));
1114
1115         switch (optname) {
1116         case SO_DEBUG:
1117                 v.val = sock_flag(sk, SOCK_DBG);
1118                 break;
1119
1120         case SO_DONTROUTE:
1121                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1122                 break;
1123
1124         case SO_BROADCAST:
1125                 v.val = sock_flag(sk, SOCK_BROADCAST);
1126                 break;
1127
1128         case SO_SNDBUF:
1129                 v.val = sk->sk_sndbuf;
1130                 break;
1131
1132         case SO_RCVBUF:
1133                 v.val = sk->sk_rcvbuf;
1134                 break;
1135
1136         case SO_REUSEADDR:
1137                 v.val = sk->sk_reuse;
1138                 break;
1139
1140         case SO_REUSEPORT:
1141                 v.val = sk->sk_reuseport;
1142                 break;
1143
1144         case SO_KEEPALIVE:
1145                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1146                 break;
1147
1148         case SO_TYPE:
1149                 v.val = sk->sk_type;
1150                 break;
1151
1152         case SO_PROTOCOL:
1153                 v.val = sk->sk_protocol;
1154                 break;
1155
1156         case SO_DOMAIN:
1157                 v.val = sk->sk_family;
1158                 break;
1159
1160         case SO_ERROR:
1161                 v.val = -sock_error(sk);
1162                 if (v.val == 0)
1163                         v.val = xchg(&sk->sk_err_soft, 0);
1164                 break;
1165
1166         case SO_OOBINLINE:
1167                 v.val = sock_flag(sk, SOCK_URGINLINE);
1168                 break;
1169
1170         case SO_NO_CHECK:
1171                 v.val = sk->sk_no_check_tx;
1172                 break;
1173
1174         case SO_PRIORITY:
1175                 v.val = sk->sk_priority;
1176                 break;
1177
1178         case SO_LINGER:
1179                 lv              = sizeof(v.ling);
1180                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1181                 v.ling.l_linger = sk->sk_lingertime / HZ;
1182                 break;
1183
1184         case SO_BSDCOMPAT:
1185                 sock_warn_obsolete_bsdism("getsockopt");
1186                 break;
1187
1188         case SO_TIMESTAMP:
1189                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1190                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1191                 break;
1192
1193         case SO_TIMESTAMPNS:
1194                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1195                 break;
1196
1197         case SO_TIMESTAMPING:
1198                 v.val = sk->sk_tsflags;
1199                 break;
1200
1201         case SO_RCVTIMEO:
1202                 lv = sizeof(struct timeval);
1203                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1204                         v.tm.tv_sec = 0;
1205                         v.tm.tv_usec = 0;
1206                 } else {
1207                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1208                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1209                 }
1210                 break;
1211
1212         case SO_SNDTIMEO:
1213                 lv = sizeof(struct timeval);
1214                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1215                         v.tm.tv_sec = 0;
1216                         v.tm.tv_usec = 0;
1217                 } else {
1218                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1219                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1220                 }
1221                 break;
1222
1223         case SO_RCVLOWAT:
1224                 v.val = sk->sk_rcvlowat;
1225                 break;
1226
1227         case SO_SNDLOWAT:
1228                 v.val = 1;
1229                 break;
1230
1231         case SO_PASSCRED:
1232                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1233                 break;
1234
1235         case SO_PEERCRED:
1236         {
1237                 struct ucred peercred;
1238                 if (len > sizeof(peercred))
1239                         len = sizeof(peercred);
1240                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1241                 if (copy_to_user(optval, &peercred, len))
1242                         return -EFAULT;
1243                 goto lenout;
1244         }
1245
1246         case SO_PEERGROUPS:
1247         {
1248                 int ret, n;
1249
1250                 if (!sk->sk_peer_cred)
1251                         return -ENODATA;
1252
1253                 n = sk->sk_peer_cred->group_info->ngroups;
1254                 if (len < n * sizeof(gid_t)) {
1255                         len = n * sizeof(gid_t);
1256                         return put_user(len, optlen) ? -EFAULT : -ERANGE;
1257                 }
1258                 len = n * sizeof(gid_t);
1259
1260                 ret = groups_to_user((gid_t __user *)optval,
1261                                      sk->sk_peer_cred->group_info);
1262                 if (ret)
1263                         return ret;
1264                 goto lenout;
1265         }
1266
1267         case SO_PEERNAME:
1268         {
1269                 char address[128];
1270
1271                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1272                         return -ENOTCONN;
1273                 if (lv < len)
1274                         return -EINVAL;
1275                 if (copy_to_user(optval, address, len))
1276                         return -EFAULT;
1277                 goto lenout;
1278         }
1279
1280         /* Dubious BSD thing... Probably nobody even uses it, but
1281          * the UNIX standard wants it for whatever reason... -DaveM
1282          */
1283         case SO_ACCEPTCONN:
1284                 v.val = sk->sk_state == TCP_LISTEN;
1285                 break;
1286
1287         case SO_PASSSEC:
1288                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1289                 break;
1290
1291         case SO_PEERSEC:
1292                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1293
1294         case SO_MARK:
1295                 v.val = sk->sk_mark;
1296                 break;
1297
1298         case SO_RXQ_OVFL:
1299                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1300                 break;
1301
1302         case SO_WIFI_STATUS:
1303                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1304                 break;
1305
1306         case SO_PEEK_OFF:
1307                 if (!sock->ops->set_peek_off)
1308                         return -EOPNOTSUPP;
1309
1310                 v.val = sk->sk_peek_off;
1311                 break;
1312         case SO_NOFCS:
1313                 v.val = sock_flag(sk, SOCK_NOFCS);
1314                 break;
1315
1316         case SO_BINDTODEVICE:
1317                 return sock_getbindtodevice(sk, optval, optlen, len);
1318
1319         case SO_GET_FILTER:
1320                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1321                 if (len < 0)
1322                         return len;
1323
1324                 goto lenout;
1325
1326         case SO_LOCK_FILTER:
1327                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1328                 break;
1329
1330         case SO_BPF_EXTENSIONS:
1331                 v.val = bpf_tell_extensions();
1332                 break;
1333
1334         case SO_SELECT_ERR_QUEUE:
1335                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1336                 break;
1337
1338 #ifdef CONFIG_NET_RX_BUSY_POLL
1339         case SO_BUSY_POLL:
1340                 v.val = sk->sk_ll_usec;
1341                 break;
1342 #endif
1343
1344         case SO_MAX_PACING_RATE:
1345                 v.val = sk->sk_max_pacing_rate;
1346                 break;
1347
1348         case SO_INCOMING_CPU:
1349                 v.val = sk->sk_incoming_cpu;
1350                 break;
1351
1352         case SO_MEMINFO:
1353         {
1354                 u32 meminfo[SK_MEMINFO_VARS];
1355
1356                 if (get_user(len, optlen))
1357                         return -EFAULT;
1358
1359                 sk_get_meminfo(sk, meminfo);
1360
1361                 len = min_t(unsigned int, len, sizeof(meminfo));
1362                 if (copy_to_user(optval, &meminfo, len))
1363                         return -EFAULT;
1364
1365                 goto lenout;
1366         }
1367
1368 #ifdef CONFIG_NET_RX_BUSY_POLL
1369         case SO_INCOMING_NAPI_ID:
1370                 v.val = READ_ONCE(sk->sk_napi_id);
1371
1372                 /* aggregate non-NAPI IDs down to 0 */
1373                 if (v.val < MIN_NAPI_ID)
1374                         v.val = 0;
1375
1376                 break;
1377 #endif
1378
1379         case SO_COOKIE:
1380                 lv = sizeof(u64);
1381                 if (len < lv)
1382                         return -EINVAL;
1383                 v.val64 = sock_gen_cookie(sk);
1384                 break;
1385
1386         default:
1387                 /* We implement the SO_SNDLOWAT etc to not be settable
1388                  * (1003.1g 7).
1389                  */
1390                 return -ENOPROTOOPT;
1391         }
1392
1393         if (len > lv)
1394                 len = lv;
1395         if (copy_to_user(optval, &v, len))
1396                 return -EFAULT;
1397 lenout:
1398         if (put_user(len, optlen))
1399                 return -EFAULT;
1400         return 0;
1401 }
1402
1403 /*
1404  * Initialize an sk_lock.
1405  *
1406  * (We also register the sk_lock with the lock validator.)
1407  */
1408 static inline void sock_lock_init(struct sock *sk)
1409 {
1410         if (sk->sk_kern_sock)
1411                 sock_lock_init_class_and_name(
1412                         sk,
1413                         af_family_kern_slock_key_strings[sk->sk_family],
1414                         af_family_kern_slock_keys + sk->sk_family,
1415                         af_family_kern_key_strings[sk->sk_family],
1416                         af_family_kern_keys + sk->sk_family);
1417         else
1418                 sock_lock_init_class_and_name(
1419                         sk,
1420                         af_family_slock_key_strings[sk->sk_family],
1421                         af_family_slock_keys + sk->sk_family,
1422                         af_family_key_strings[sk->sk_family],
1423                         af_family_keys + sk->sk_family);
1424 }
1425
1426 /*
1427  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1428  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1429  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1430  */
1431 static void sock_copy(struct sock *nsk, const struct sock *osk)
1432 {
1433 #ifdef CONFIG_SECURITY_NETWORK
1434         void *sptr = nsk->sk_security;
1435 #endif
1436         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1437
1438         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1439                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1440
1441 #ifdef CONFIG_SECURITY_NETWORK
1442         nsk->sk_security = sptr;
1443         security_sk_clone(osk, nsk);
1444 #endif
1445 }
1446
1447 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1448                 int family)
1449 {
1450         struct sock *sk;
1451         struct kmem_cache *slab;
1452
1453         slab = prot->slab;
1454         if (slab != NULL) {
1455                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1456                 if (!sk)
1457                         return sk;
1458                 if (priority & __GFP_ZERO)
1459                         sk_prot_clear_nulls(sk, prot->obj_size);
1460         } else
1461                 sk = kmalloc(prot->obj_size, priority);
1462
1463         if (sk != NULL) {
1464                 kmemcheck_annotate_bitfield(sk, flags);
1465
1466                 if (security_sk_alloc(sk, family, priority))
1467                         goto out_free;
1468
1469                 if (!try_module_get(prot->owner))
1470                         goto out_free_sec;
1471                 sk_tx_queue_clear(sk);
1472         }
1473
1474         return sk;
1475
1476 out_free_sec:
1477         security_sk_free(sk);
1478 out_free:
1479         if (slab != NULL)
1480                 kmem_cache_free(slab, sk);
1481         else
1482                 kfree(sk);
1483         return NULL;
1484 }
1485
1486 static void sk_prot_free(struct proto *prot, struct sock *sk)
1487 {
1488         struct kmem_cache *slab;
1489         struct module *owner;
1490
1491         owner = prot->owner;
1492         slab = prot->slab;
1493
1494         cgroup_sk_free(&sk->sk_cgrp_data);
1495         mem_cgroup_sk_free(sk);
1496         security_sk_free(sk);
1497         if (slab != NULL)
1498                 kmem_cache_free(slab, sk);
1499         else
1500                 kfree(sk);
1501         module_put(owner);
1502 }
1503
1504 /**
1505  *      sk_alloc - All socket objects are allocated here
1506  *      @net: the applicable net namespace
1507  *      @family: protocol family
1508  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1509  *      @prot: struct proto associated with this new sock instance
1510  *      @kern: is this to be a kernel socket?
1511  */
1512 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1513                       struct proto *prot, int kern)
1514 {
1515         struct sock *sk;
1516
1517         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1518         if (sk) {
1519                 sk->sk_family = family;
1520                 /*
1521                  * See comment in struct sock definition to understand
1522                  * why we need sk_prot_creator -acme
1523                  */
1524                 sk->sk_prot = sk->sk_prot_creator = prot;
1525                 sk->sk_kern_sock = kern;
1526                 sock_lock_init(sk);
1527                 sk->sk_net_refcnt = kern ? 0 : 1;
1528                 if (likely(sk->sk_net_refcnt))
1529                         get_net(net);
1530                 sock_net_set(sk, net);
1531                 refcount_set(&sk->sk_wmem_alloc, 1);
1532
1533                 mem_cgroup_sk_alloc(sk);
1534                 cgroup_sk_alloc(&sk->sk_cgrp_data);
1535                 sock_update_classid(&sk->sk_cgrp_data);
1536                 sock_update_netprioidx(&sk->sk_cgrp_data);
1537         }
1538
1539         return sk;
1540 }
1541 EXPORT_SYMBOL(sk_alloc);
1542
1543 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1544  * grace period. This is the case for UDP sockets and TCP listeners.
1545  */
1546 static void __sk_destruct(struct rcu_head *head)
1547 {
1548         struct sock *sk = container_of(head, struct sock, sk_rcu);
1549         struct sk_filter *filter;
1550
1551         if (sk->sk_destruct)
1552                 sk->sk_destruct(sk);
1553
1554         filter = rcu_dereference_check(sk->sk_filter,
1555                                        refcount_read(&sk->sk_wmem_alloc) == 0);
1556         if (filter) {
1557                 sk_filter_uncharge(sk, filter);
1558                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1559         }
1560         if (rcu_access_pointer(sk->sk_reuseport_cb))
1561                 reuseport_detach_sock(sk);
1562
1563         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1564
1565         if (atomic_read(&sk->sk_omem_alloc))
1566                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1567                          __func__, atomic_read(&sk->sk_omem_alloc));
1568
1569         if (sk->sk_frag.page) {
1570                 put_page(sk->sk_frag.page);
1571                 sk->sk_frag.page = NULL;
1572         }
1573
1574         if (sk->sk_peer_cred)
1575                 put_cred(sk->sk_peer_cred);
1576         put_pid(sk->sk_peer_pid);
1577         if (likely(sk->sk_net_refcnt))
1578                 put_net(sock_net(sk));
1579         sk_prot_free(sk->sk_prot_creator, sk);
1580 }
1581
1582 void sk_destruct(struct sock *sk)
1583 {
1584         if (sock_flag(sk, SOCK_RCU_FREE))
1585                 call_rcu(&sk->sk_rcu, __sk_destruct);
1586         else
1587                 __sk_destruct(&sk->sk_rcu);
1588 }
1589
1590 static void __sk_free(struct sock *sk)
1591 {
1592         if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1593                 sock_diag_broadcast_destroy(sk);
1594         else
1595                 sk_destruct(sk);
1596 }
1597
1598 void sk_free(struct sock *sk)
1599 {
1600         /*
1601          * We subtract one from sk_wmem_alloc and can know if
1602          * some packets are still in some tx queue.
1603          * If not null, sock_wfree() will call __sk_free(sk) later
1604          */
1605         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1606                 __sk_free(sk);
1607 }
1608 EXPORT_SYMBOL(sk_free);
1609
1610 static void sk_init_common(struct sock *sk)
1611 {
1612         skb_queue_head_init(&sk->sk_receive_queue);
1613         skb_queue_head_init(&sk->sk_write_queue);
1614         skb_queue_head_init(&sk->sk_error_queue);
1615
1616         rwlock_init(&sk->sk_callback_lock);
1617         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1618                         af_rlock_keys + sk->sk_family,
1619                         af_family_rlock_key_strings[sk->sk_family]);
1620         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1621                         af_wlock_keys + sk->sk_family,
1622                         af_family_wlock_key_strings[sk->sk_family]);
1623         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1624                         af_elock_keys + sk->sk_family,
1625                         af_family_elock_key_strings[sk->sk_family]);
1626         lockdep_set_class_and_name(&sk->sk_callback_lock,
1627                         af_callback_keys + sk->sk_family,
1628                         af_family_clock_key_strings[sk->sk_family]);
1629 }
1630
1631 /**
1632  *      sk_clone_lock - clone a socket, and lock its clone
1633  *      @sk: the socket to clone
1634  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1635  *
1636  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1637  */
1638 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1639 {
1640         struct sock *newsk;
1641         bool is_charged = true;
1642
1643         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1644         if (newsk != NULL) {
1645                 struct sk_filter *filter;
1646
1647                 sock_copy(newsk, sk);
1648
1649                 /* SANITY */
1650                 if (likely(newsk->sk_net_refcnt))
1651                         get_net(sock_net(newsk));
1652                 sk_node_init(&newsk->sk_node);
1653                 sock_lock_init(newsk);
1654                 bh_lock_sock(newsk);
1655                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1656                 newsk->sk_backlog.len = 0;
1657
1658                 atomic_set(&newsk->sk_rmem_alloc, 0);
1659                 /*
1660                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1661                  */
1662                 refcount_set(&newsk->sk_wmem_alloc, 1);
1663                 atomic_set(&newsk->sk_omem_alloc, 0);
1664                 sk_init_common(newsk);
1665
1666                 newsk->sk_dst_cache     = NULL;
1667                 newsk->sk_dst_pending_confirm = 0;
1668                 newsk->sk_wmem_queued   = 0;
1669                 newsk->sk_forward_alloc = 0;
1670                 atomic_set(&newsk->sk_drops, 0);
1671                 newsk->sk_send_head     = NULL;
1672                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1673
1674                 sock_reset_flag(newsk, SOCK_DONE);
1675
1676                 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1677                 if (filter != NULL)
1678                         /* though it's an empty new sock, the charging may fail
1679                          * if sysctl_optmem_max was changed between creation of
1680                          * original socket and cloning
1681                          */
1682                         is_charged = sk_filter_charge(newsk, filter);
1683
1684                 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1685                         /* We need to make sure that we don't uncharge the new
1686                          * socket if we couldn't charge it in the first place
1687                          * as otherwise we uncharge the parent's filter.
1688                          */
1689                         if (!is_charged)
1690                                 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1691                         sk_free_unlock_clone(newsk);
1692                         newsk = NULL;
1693                         goto out;
1694                 }
1695                 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1696
1697                 newsk->sk_err      = 0;
1698                 newsk->sk_err_soft = 0;
1699                 newsk->sk_priority = 0;
1700                 newsk->sk_incoming_cpu = raw_smp_processor_id();
1701                 atomic64_set(&newsk->sk_cookie, 0);
1702
1703                 mem_cgroup_sk_alloc(newsk);
1704                 cgroup_sk_alloc(&newsk->sk_cgrp_data);
1705
1706                 /*
1707                  * Before updating sk_refcnt, we must commit prior changes to memory
1708                  * (Documentation/RCU/rculist_nulls.txt for details)
1709                  */
1710                 smp_wmb();
1711                 refcount_set(&newsk->sk_refcnt, 2);
1712
1713                 /*
1714                  * Increment the counter in the same struct proto as the master
1715                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1716                  * is the same as sk->sk_prot->socks, as this field was copied
1717                  * with memcpy).
1718                  *
1719                  * This _changes_ the previous behaviour, where
1720                  * tcp_create_openreq_child always was incrementing the
1721                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1722                  * to be taken into account in all callers. -acme
1723                  */
1724                 sk_refcnt_debug_inc(newsk);
1725                 sk_set_socket(newsk, NULL);
1726                 newsk->sk_wq = NULL;
1727
1728                 if (newsk->sk_prot->sockets_allocated)
1729                         sk_sockets_allocated_inc(newsk);
1730
1731                 if (sock_needs_netstamp(sk) &&
1732                     newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1733                         net_enable_timestamp();
1734         }
1735 out:
1736         return newsk;
1737 }
1738 EXPORT_SYMBOL_GPL(sk_clone_lock);
1739
1740 void sk_free_unlock_clone(struct sock *sk)
1741 {
1742         /* It is still raw copy of parent, so invalidate
1743          * destructor and make plain sk_free() */
1744         sk->sk_destruct = NULL;
1745         bh_unlock_sock(sk);
1746         sk_free(sk);
1747 }
1748 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1749
1750 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1751 {
1752         u32 max_segs = 1;
1753
1754         sk_dst_set(sk, dst);
1755         sk->sk_route_caps = dst->dev->features;
1756         if (sk->sk_route_caps & NETIF_F_GSO)
1757                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1758         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1759         if (sk_can_gso(sk)) {
1760                 if (dst->header_len) {
1761                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1762                 } else {
1763                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1764                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1765                         max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1766                 }
1767         }
1768         sk->sk_gso_max_segs = max_segs;
1769 }
1770 EXPORT_SYMBOL_GPL(sk_setup_caps);
1771
1772 /*
1773  *      Simple resource managers for sockets.
1774  */
1775
1776
1777 /*
1778  * Write buffer destructor automatically called from kfree_skb.
1779  */
1780 void sock_wfree(struct sk_buff *skb)
1781 {
1782         struct sock *sk = skb->sk;
1783         unsigned int len = skb->truesize;
1784
1785         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1786                 /*
1787                  * Keep a reference on sk_wmem_alloc, this will be released
1788                  * after sk_write_space() call
1789                  */
1790                 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1791                 sk->sk_write_space(sk);
1792                 len = 1;
1793         }
1794         /*
1795          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1796          * could not do because of in-flight packets
1797          */
1798         if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1799                 __sk_free(sk);
1800 }
1801 EXPORT_SYMBOL(sock_wfree);
1802
1803 /* This variant of sock_wfree() is used by TCP,
1804  * since it sets SOCK_USE_WRITE_QUEUE.
1805  */
1806 void __sock_wfree(struct sk_buff *skb)
1807 {
1808         struct sock *sk = skb->sk;
1809
1810         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1811                 __sk_free(sk);
1812 }
1813
1814 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1815 {
1816         skb_orphan(skb);
1817         skb->sk = sk;
1818 #ifdef CONFIG_INET
1819         if (unlikely(!sk_fullsock(sk))) {
1820                 skb->destructor = sock_edemux;
1821                 sock_hold(sk);
1822                 return;
1823         }
1824 #endif
1825         skb->destructor = sock_wfree;
1826         skb_set_hash_from_sk(skb, sk);
1827         /*
1828          * We used to take a refcount on sk, but following operation
1829          * is enough to guarantee sk_free() wont free this sock until
1830          * all in-flight packets are completed
1831          */
1832         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1833 }
1834 EXPORT_SYMBOL(skb_set_owner_w);
1835
1836 /* This helper is used by netem, as it can hold packets in its
1837  * delay queue. We want to allow the owner socket to send more
1838  * packets, as if they were already TX completed by a typical driver.
1839  * But we also want to keep skb->sk set because some packet schedulers
1840  * rely on it (sch_fq for example).
1841  */
1842 void skb_orphan_partial(struct sk_buff *skb)
1843 {
1844         if (skb_is_tcp_pure_ack(skb))
1845                 return;
1846
1847         if (skb->destructor == sock_wfree
1848 #ifdef CONFIG_INET
1849             || skb->destructor == tcp_wfree
1850 #endif
1851                 ) {
1852                 struct sock *sk = skb->sk;
1853
1854                 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1855                         WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1856                         skb->destructor = sock_efree;
1857                 }
1858         } else {
1859                 skb_orphan(skb);
1860         }
1861 }
1862 EXPORT_SYMBOL(skb_orphan_partial);
1863
1864 /*
1865  * Read buffer destructor automatically called from kfree_skb.
1866  */
1867 void sock_rfree(struct sk_buff *skb)
1868 {
1869         struct sock *sk = skb->sk;
1870         unsigned int len = skb->truesize;
1871
1872         atomic_sub(len, &sk->sk_rmem_alloc);
1873         sk_mem_uncharge(sk, len);
1874 }
1875 EXPORT_SYMBOL(sock_rfree);
1876
1877 /*
1878  * Buffer destructor for skbs that are not used directly in read or write
1879  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1880  */
1881 void sock_efree(struct sk_buff *skb)
1882 {
1883         sock_put(skb->sk);
1884 }
1885 EXPORT_SYMBOL(sock_efree);
1886
1887 kuid_t sock_i_uid(struct sock *sk)
1888 {
1889         kuid_t uid;
1890
1891         read_lock_bh(&sk->sk_callback_lock);
1892         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1893         read_unlock_bh(&sk->sk_callback_lock);
1894         return uid;
1895 }
1896 EXPORT_SYMBOL(sock_i_uid);
1897
1898 unsigned long sock_i_ino(struct sock *sk)
1899 {
1900         unsigned long ino;
1901
1902         read_lock_bh(&sk->sk_callback_lock);
1903         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1904         read_unlock_bh(&sk->sk_callback_lock);
1905         return ino;
1906 }
1907 EXPORT_SYMBOL(sock_i_ino);
1908
1909 /*
1910  * Allocate a skb from the socket's send buffer.
1911  */
1912 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1913                              gfp_t priority)
1914 {
1915         if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1916                 struct sk_buff *skb = alloc_skb(size, priority);
1917                 if (skb) {
1918                         skb_set_owner_w(skb, sk);
1919                         return skb;
1920                 }
1921         }
1922         return NULL;
1923 }
1924 EXPORT_SYMBOL(sock_wmalloc);
1925
1926 /*
1927  * Allocate a memory block from the socket's option memory buffer.
1928  */
1929 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1930 {
1931         if ((unsigned int)size <= sysctl_optmem_max &&
1932             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1933                 void *mem;
1934                 /* First do the add, to avoid the race if kmalloc
1935                  * might sleep.
1936                  */
1937                 atomic_add(size, &sk->sk_omem_alloc);
1938                 mem = kmalloc(size, priority);
1939                 if (mem)
1940                         return mem;
1941                 atomic_sub(size, &sk->sk_omem_alloc);
1942         }
1943         return NULL;
1944 }
1945 EXPORT_SYMBOL(sock_kmalloc);
1946
1947 /* Free an option memory block. Note, we actually want the inline
1948  * here as this allows gcc to detect the nullify and fold away the
1949  * condition entirely.
1950  */
1951 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1952                                   const bool nullify)
1953 {
1954         if (WARN_ON_ONCE(!mem))
1955                 return;
1956         if (nullify)
1957                 kzfree(mem);
1958         else
1959                 kfree(mem);
1960         atomic_sub(size, &sk->sk_omem_alloc);
1961 }
1962
1963 void sock_kfree_s(struct sock *sk, void *mem, int size)
1964 {
1965         __sock_kfree_s(sk, mem, size, false);
1966 }
1967 EXPORT_SYMBOL(sock_kfree_s);
1968
1969 void sock_kzfree_s(struct sock *sk, void *mem, int size)
1970 {
1971         __sock_kfree_s(sk, mem, size, true);
1972 }
1973 EXPORT_SYMBOL(sock_kzfree_s);
1974
1975 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1976    I think, these locks should be removed for datagram sockets.
1977  */
1978 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1979 {
1980         DEFINE_WAIT(wait);
1981
1982         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1983         for (;;) {
1984                 if (!timeo)
1985                         break;
1986                 if (signal_pending(current))
1987                         break;
1988                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1989                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1990                 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1991                         break;
1992                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1993                         break;
1994                 if (sk->sk_err)
1995                         break;
1996                 timeo = schedule_timeout(timeo);
1997         }
1998         finish_wait(sk_sleep(sk), &wait);
1999         return timeo;
2000 }
2001
2002
2003 /*
2004  *      Generic send/receive buffer handlers
2005  */
2006
2007 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2008                                      unsigned long data_len, int noblock,
2009                                      int *errcode, int max_page_order)
2010 {
2011         struct sk_buff *skb;
2012         long timeo;
2013         int err;
2014
2015         timeo = sock_sndtimeo(sk, noblock);
2016         for (;;) {
2017                 err = sock_error(sk);
2018                 if (err != 0)
2019                         goto failure;
2020
2021                 err = -EPIPE;
2022                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2023                         goto failure;
2024
2025                 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2026                         break;
2027
2028                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2029                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2030                 err = -EAGAIN;
2031                 if (!timeo)
2032                         goto failure;
2033                 if (signal_pending(current))
2034                         goto interrupted;
2035                 timeo = sock_wait_for_wmem(sk, timeo);
2036         }
2037         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2038                                    errcode, sk->sk_allocation);
2039         if (skb)
2040                 skb_set_owner_w(skb, sk);
2041         return skb;
2042
2043 interrupted:
2044         err = sock_intr_errno(timeo);
2045 failure:
2046         *errcode = err;
2047         return NULL;
2048 }
2049 EXPORT_SYMBOL(sock_alloc_send_pskb);
2050
/*
 * Convenience form of sock_alloc_send_pskb(): @size is used as the
 * header length, with a data length of 0 and a max page order of 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long data_len = 0;
	const int max_page_order = 0;

	return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
				    max_page_order);
}
2056 EXPORT_SYMBOL(sock_alloc_send_skb);
2057
2058 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2059                      struct sockcm_cookie *sockc)
2060 {
2061         u32 tsflags;
2062
2063         switch (cmsg->cmsg_type) {
2064         case SO_MARK:
2065                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2066                         return -EPERM;
2067                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2068                         return -EINVAL;
2069                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2070                 break;
2071         case SO_TIMESTAMPING:
2072                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2073                         return -EINVAL;
2074
2075                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2076                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2077                         return -EINVAL;
2078
2079                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2080                 sockc->tsflags |= tsflags;
2081                 break;
2082         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2083         case SCM_RIGHTS:
2084         case SCM_CREDENTIALS:
2085                 break;
2086         default:
2087                 return -EINVAL;
2088         }
2089         return 0;
2090 }
2091 EXPORT_SYMBOL(__sock_cmsg_send);
2092
2093 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2094                    struct sockcm_cookie *sockc)
2095 {
2096         struct cmsghdr *cmsg;
2097         int ret;
2098
2099         for_each_cmsghdr(cmsg, msg) {
2100                 if (!CMSG_OK(msg, cmsg))
2101                         return -EINVAL;
2102                 if (cmsg->cmsg_level != SOL_SOCKET)
2103                         continue;
2104                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2105                 if (ret)
2106                         return ret;
2107         }
2108         return 0;
2109 }
2110 EXPORT_SYMBOL(sock_cmsg_send);
2111
2112 static void sk_enter_memory_pressure(struct sock *sk)
2113 {
2114         if (!sk->sk_prot->enter_memory_pressure)
2115                 return;
2116
2117         sk->sk_prot->enter_memory_pressure(sk);
2118 }
2119
2120 static void sk_leave_memory_pressure(struct sock *sk)
2121 {
2122         if (sk->sk_prot->leave_memory_pressure) {
2123                 sk->sk_prot->leave_memory_pressure(sk);
2124         } else {
2125                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2126
2127                 if (memory_pressure && *memory_pressure)
2128                         *memory_pressure = 0;
2129         }
2130 }
2131
2132 /* On 32bit arches, an skb frag is limited to 2^15 */
2133 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
2134
2135 /**
2136  * skb_page_frag_refill - check that a page_frag contains enough room
2137  * @sz: minimum size of the fragment we want to get
2138  * @pfrag: pointer to page_frag
2139  * @gfp: priority for memory allocation
2140  *
2141  * Note: While this allocator tries to use high order pages, there is
2142  * no guarantee that allocations succeed. Therefore, @sz MUST be
2143  * less or equal than PAGE_SIZE.
2144  */
2145 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2146 {
2147         if (pfrag->page) {
2148                 if (page_ref_count(pfrag->page) == 1) {
2149                         pfrag->offset = 0;
2150                         return true;
2151                 }
2152                 if (pfrag->offset + sz <= pfrag->size)
2153                         return true;
2154                 put_page(pfrag->page);
2155         }
2156
2157         pfrag->offset = 0;
2158         if (SKB_FRAG_PAGE_ORDER) {
2159                 /* Avoid direct reclaim but allow kswapd to wake */
2160                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2161                                           __GFP_COMP | __GFP_NOWARN |
2162                                           __GFP_NORETRY,
2163                                           SKB_FRAG_PAGE_ORDER);
2164                 if (likely(pfrag->page)) {
2165                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2166                         return true;
2167                 }
2168         }
2169         pfrag->page = alloc_page(gfp);
2170         if (likely(pfrag->page)) {
2171                 pfrag->size = PAGE_SIZE;
2172                 return true;
2173         }
2174         return false;
2175 }
2176 EXPORT_SYMBOL(skb_page_frag_refill);
2177
2178 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2179 {
2180         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2181                 return true;
2182
2183         sk_enter_memory_pressure(sk);
2184         sk_stream_moderate_sndbuf(sk);
2185         return false;
2186 }
2187 EXPORT_SYMBOL(sk_page_frag_refill);
2188
2189 static void __lock_sock(struct sock *sk)
2190         __releases(&sk->sk_lock.slock)
2191         __acquires(&sk->sk_lock.slock)
2192 {
2193         DEFINE_WAIT(wait);
2194
2195         for (;;) {
2196                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2197                                         TASK_UNINTERRUPTIBLE);
2198                 spin_unlock_bh(&sk->sk_lock.slock);
2199                 schedule();
2200                 spin_lock_bh(&sk->sk_lock.slock);
2201                 if (!sock_owned_by_user(sk))
2202                         break;
2203         }
2204         finish_wait(&sk->sk_lock.wq, &wait);
2205 }
2206
2207 static void __release_sock(struct sock *sk)
2208         __releases(&sk->sk_lock.slock)
2209         __acquires(&sk->sk_lock.slock)
2210 {
2211         struct sk_buff *skb, *next;
2212
2213         while ((skb = sk->sk_backlog.head) != NULL) {
2214                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2215
2216                 spin_unlock_bh(&sk->sk_lock.slock);
2217
2218                 do {
2219                         next = skb->next;
2220                         prefetch(next);
2221                         WARN_ON_ONCE(skb_dst_is_noref(skb));
2222                         skb->next = NULL;
2223                         sk_backlog_rcv(sk, skb);
2224
2225                         cond_resched();
2226
2227                         skb = next;
2228                 } while (skb != NULL);
2229
2230                 spin_lock_bh(&sk->sk_lock.slock);
2231         }
2232
2233         /*
2234          * Doing the zeroing here guarantee we can not loop forever
2235          * while a wild producer attempts to flood us.
2236          */
2237         sk->sk_backlog.len = 0;
2238 }
2239
2240 void __sk_flush_backlog(struct sock *sk)
2241 {
2242         spin_lock_bh(&sk->sk_lock.slock);
2243         __release_sock(sk);
2244         spin_unlock_bh(&sk->sk_lock.slock);
2245 }
2246
2247 /**
2248  * sk_wait_data - wait for data to arrive at sk_receive_queue
2249  * @sk:    sock to wait on
2250  * @timeo: for how long
2251  * @skb:   last skb seen on sk_receive_queue
2252  *
2253  * Now socket state including sk->sk_err is changed only under lock,
2254  * hence we may omit checks after joining wait queue.
2255  * We check receive queue before schedule() only as optimization;
2256  * it is very likely that release_sock() added new data.
2257  */
2258 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2259 {
2260         DEFINE_WAIT_FUNC(wait, woken_wake_function);
2261         int rc;
2262
2263         add_wait_queue(sk_sleep(sk), &wait);
2264         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2265         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2266         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2267         remove_wait_queue(sk_sleep(sk), &wait);
2268         return rc;
2269 }
2270 EXPORT_SYMBOL(sk_wait_data);
2271
2272 /**
2273  *      __sk_mem_raise_allocated - increase memory_allocated
2274  *      @sk: socket
2275  *      @size: memory size to allocate
2276  *      @amt: pages to allocate
2277  *      @kind: allocation type
2278  *
2279  *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2280  */
2281 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2282 {
2283         struct proto *prot = sk->sk_prot;
2284         long allocated = sk_memory_allocated_add(sk, amt);
2285
2286         if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2287             !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2288                 goto suppress_allocation;
2289
2290         /* Under limit. */
2291         if (allocated <= sk_prot_mem_limits(sk, 0)) {
2292                 sk_leave_memory_pressure(sk);
2293                 return 1;
2294         }
2295
2296         /* Under pressure. */
2297         if (allocated > sk_prot_mem_limits(sk, 1))
2298                 sk_enter_memory_pressure(sk);
2299
2300         /* Over hard limit. */
2301         if (allocated > sk_prot_mem_limits(sk, 2))
2302                 goto suppress_allocation;
2303
2304         /* guarantee minimum buffer size under pressure */
2305         if (kind == SK_MEM_RECV) {
2306                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2307                         return 1;
2308
2309         } else { /* SK_MEM_SEND */
2310                 if (sk->sk_type == SOCK_STREAM) {
2311                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2312                                 return 1;
2313                 } else if (refcount_read(&sk->sk_wmem_alloc) <
2314                            prot->sysctl_wmem[0])
2315                                 return 1;
2316         }
2317
2318         if (sk_has_memory_pressure(sk)) {
2319                 int alloc;
2320
2321                 if (!sk_under_memory_pressure(sk))
2322                         return 1;
2323                 alloc = sk_sockets_allocated_read_positive(sk);
2324                 if (sk_prot_mem_limits(sk, 2) > alloc *
2325                     sk_mem_pages(sk->sk_wmem_queued +
2326                                  atomic_read(&sk->sk_rmem_alloc) +
2327                                  sk->sk_forward_alloc))
2328                         return 1;
2329         }
2330
2331 suppress_allocation:
2332
2333         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2334                 sk_stream_moderate_sndbuf(sk);
2335
2336                 /* Fail only if socket is _under_ its sndbuf.
2337                  * In this case we cannot block, so that we have to fail.
2338                  */
2339                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2340                         return 1;
2341         }
2342
2343         trace_sock_exceed_buf_limit(sk, prot, allocated);
2344
2345         sk_memory_allocated_sub(sk, amt);
2346
2347         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2348                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2349
2350         return 0;
2351 }
2352 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2353
2354 /**
2355  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2356  *      @sk: socket
2357  *      @size: memory size to allocate
2358  *      @kind: allocation type
2359  *
2360  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2361  *      rmem allocation. This function assumes that protocols which have
2362  *      memory_pressure use sk_wmem_queued as write buffer accounting.
2363  */
2364 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2365 {
2366         int ret, amt = sk_mem_pages(size);
2367
2368         sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2369         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2370         if (!ret)
2371                 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2372         return ret;
2373 }
2374 EXPORT_SYMBOL(__sk_mem_schedule);
2375
2376 /**
2377  *      __sk_mem_reduce_allocated - reclaim memory_allocated
2378  *      @sk: socket
2379  *      @amount: number of quanta
2380  *
2381  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2382  */
2383 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2384 {
2385         sk_memory_allocated_sub(sk, amount);
2386
2387         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2388                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2389
2390         if (sk_under_memory_pressure(sk) &&
2391             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2392                 sk_leave_memory_pressure(sk);
2393 }
2394 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2395
2396 /**
2397  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2398  *      @sk: socket
2399  *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2400  */
2401 void __sk_mem_reclaim(struct sock *sk, int amount)
2402 {
2403         amount >>= SK_MEM_QUANTUM_SHIFT;
2404         sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2405         __sk_mem_reduce_allocated(sk, amount);
2406 }
2407 EXPORT_SYMBOL(__sk_mem_reclaim);
2408
2409 int sk_set_peek_off(struct sock *sk, int val)
2410 {
2411         if (val < 0)
2412                 return -EINVAL;
2413
2414         sk->sk_peek_off = val;
2415         return 0;
2416 }
2417 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2418
2419 /*
2420  * Set of default routines for initialising struct proto_ops when
2421  * the protocol does not support a particular function. In certain
2422  * cases where it makes no sense for a protocol to have a "do nothing"
2423  * function, some default processing is provided.
2424  */
2425
/*
 * Default bind handler for protocols without bind support: ignores its
 * arguments and always returns -EOPNOTSUPP.
 */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
2430 EXPORT_SYMBOL(sock_no_bind);
2431
/*
 * Default connect handler for protocols without connect support:
 * ignores its arguments and always returns -EOPNOTSUPP.
 */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
2437 EXPORT_SYMBOL(sock_no_connect);
2438
/*
 * Default socketpair handler for protocols without socketpair support:
 * ignores both sockets and always returns -EOPNOTSUPP.
 */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
2443 EXPORT_SYMBOL(sock_no_socketpair);
2444
/*
 * Default accept handler for protocols without accept support: ignores
 * its arguments and always returns -EOPNOTSUPP.
 */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
		   bool kern)
{
	return -EOPNOTSUPP;
}
2450 EXPORT_SYMBOL(sock_no_accept);
2451
/*
 * Default getname handler for protocols without getname support:
 * ignores its arguments (nothing is written through @saddr or @len)
 * and always returns -EOPNOTSUPP.
 */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
2457 EXPORT_SYMBOL(sock_no_getname);
2458
2459 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2460 {
2461         return 0;
2462 }
2463 EXPORT_SYMBOL(sock_no_poll);
2464
/*
 * Default ioctl handler for protocols without ioctl support: ignores
 * its arguments and always returns -EOPNOTSUPP.
 */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
2469 EXPORT_SYMBOL(sock_no_ioctl);
2470
/*
 * Default listen handler for protocols without listen support: ignores
 * its arguments and always returns -EOPNOTSUPP.
 */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
2475 EXPORT_SYMBOL(sock_no_listen);
2476
/*
 * Default shutdown handler for protocols without shutdown support:
 * ignores its arguments and always returns -EOPNOTSUPP.
 */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
2481 EXPORT_SYMBOL(sock_no_shutdown);
2482
2483 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2484                     char __user *optval, unsigned int optlen)
2485 {
2486         return -EOPNOTSUPP;
2487 }
2488 EXPORT_SYMBOL(sock_no_setsockopt);
2489
2490 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2491                     char __user *optval, int __user *optlen)
2492 {
2493         return -EOPNOTSUPP;
2494 }
2495 EXPORT_SYMBOL(sock_no_getsockopt);
2496
/*
 * Default sendmsg handler for protocols without sendmsg support:
 * ignores its arguments and always returns -EOPNOTSUPP.
 */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
2501 EXPORT_SYMBOL(sock_no_sendmsg);
2502
/*
 * Default sendmsg_locked handler for protocols without such support:
 * ignores its arguments and always returns -EOPNOTSUPP.
 */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
2507 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2508
/*
 * Default recvmsg handler for protocols without recvmsg support:
 * ignores its arguments and always returns -EOPNOTSUPP.
 */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
2514 EXPORT_SYMBOL(sock_no_recvmsg);
2515
/*
 * Default mmap handler for protocols without mmap support: ignores its
 * arguments and always returns -ENODEV, mirroring the error code used
 * for a missing mmap method.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	return -ENODEV;
}
2521 EXPORT_SYMBOL(sock_no_mmap);
2522
2523 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2524 {
2525         ssize_t res;
2526         struct msghdr msg = {.msg_flags = flags};
2527         struct kvec iov;
2528         char *kaddr = kmap(page);
2529         iov.iov_base = kaddr + offset;
2530         iov.iov_len = size;
2531         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2532         kunmap(page);
2533         return res;
2534 }
2535 EXPORT_SYMBOL(sock_no_sendpage);
2536
2537 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2538                                 int offset, size_t size, int flags)
2539 {
2540         ssize_t res;
2541         struct msghdr msg = {.msg_flags = flags};
2542         struct kvec iov;
2543         char *kaddr = kmap(page);
2544
2545         iov.iov_base = kaddr + offset;
2546         iov.iov_len = size;
2547         res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2548         kunmap(page);
2549         return res;
2550 }
2551 EXPORT_SYMBOL(sock_no_sendpage_locked);
2552
2553 /*
2554  *      Default Socket Callbacks
2555  */
2556
2557 static void sock_def_wakeup(struct sock *sk)
2558 {
2559         struct socket_wq *wq;
2560
2561         rcu_read_lock();
2562         wq = rcu_dereference(sk->sk_wq);
2563         if (skwq_has_sleeper(wq))
2564                 wake_up_interruptible_all(&wq->wait);
2565         rcu_read_unlock();
2566 }
2567
2568 static void sock_def_error_report(struct sock *sk)
2569 {
2570         struct socket_wq *wq;
2571
2572         rcu_read_lock();
2573         wq = rcu_dereference(sk->sk_wq);
2574         if (skwq_has_sleeper(wq))
2575                 wake_up_interruptible_poll(&wq->wait, POLLERR);
2576         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2577         rcu_read_unlock();
2578 }
2579
2580 static void sock_def_readable(struct sock *sk)
2581 {
2582         struct socket_wq *wq;
2583
2584         rcu_read_lock();
2585         wq = rcu_dereference(sk->sk_wq);
2586         if (skwq_has_sleeper(wq))
2587                 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2588                                                 POLLRDNORM | POLLRDBAND);
2589         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2590         rcu_read_unlock();
2591 }
2592
2593 static void sock_def_write_space(struct sock *sk)
2594 {
2595         struct socket_wq *wq;
2596
2597         rcu_read_lock();
2598
2599         /* Do not wake up a writer until he can make "significant"
2600          * progress.  --DaveM
2601          */
2602         if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2603                 wq = rcu_dereference(sk->sk_wq);
2604                 if (skwq_has_sleeper(wq))
2605                         wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2606                                                 POLLWRNORM | POLLWRBAND);
2607
2608                 /* Should agree with poll, otherwise some programs break */
2609                 if (sock_writeable(sk))
2610                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2611         }
2612
2613         rcu_read_unlock();
2614 }
2615
/*
 * Default sk_destruct callback (installed by sock_init_data()):
 * intentionally does nothing.
 */
static void sock_def_destruct(struct sock *sk)
{
}
2619
2620 void sk_send_sigurg(struct sock *sk)
2621 {
2622         if (sk->sk_socket && sk->sk_socket->file)
2623                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2624                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2625 }
2626 EXPORT_SYMBOL(sk_send_sigurg);
2627
/*
 * (Re)arm @timer to fire at @expires on behalf of @sk.
 *
 * A reference on @sk is taken only when mod_timer() returns zero
 * (presumably meaning the timer was not already pending - confirm
 * against the timer API), so one armed timer holds one reference.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
                    unsigned long expires)
{
        if (mod_timer(timer, expires))
                return;

        sock_hold(sk);
}
2634 EXPORT_SYMBOL(sk_reset_timer);
2635
/*
 * Cancel @timer on behalf of @sk.
 *
 * When del_timer() returns non-zero the reference count of @sk is
 * dropped with __sock_put(); this mirrors the sock_hold() done in
 * sk_reset_timer().
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
        if (!del_timer(timer))
                return;

        __sock_put(sk);
}
2641 EXPORT_SYMBOL(sk_stop_timer);
2642
/*
 * Initialise the generic fields of @sk to their defaults and, when @sock
 * is not NULL, link the two objects together.
 *
 * @sock: socket to attach, or NULL for a sock without a socket
 * @sk:   sock to initialise
 *
 * All fields are written first; only then, after a write barrier, is the
 * reference count set to 1.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
        sk_init_common(sk);
        sk->sk_send_head        =       NULL;

        init_timer(&sk->sk_timer);

        /* Buffer sizes start from the sysctl defaults */
        sk->sk_allocation       =       GFP_KERNEL;
        sk->sk_rcvbuf           =       sysctl_rmem_default;
        sk->sk_sndbuf           =       sysctl_wmem_default;
        sk->sk_state            =       TCP_CLOSE;
        sk_set_socket(sk, sock);

        sock_set_flag(sk, SOCK_ZAPPED);

        if (sock) {
                /* Inherit type, wait queue and owner uid from the socket */
                sk->sk_type     =       sock->type;
                sk->sk_wq       =       sock->wq;
                sock->sk        =       sk;
                sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
        } else {
                /* No socket: no wait queue, uid 0 of the netns user namespace */
                sk->sk_wq       =       NULL;
                sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
        }

        /*
         * The callback lock gets a lockdep class chosen by address family,
         * with a separate set of classes for sk_kern_sock socks.
         */
        rwlock_init(&sk->sk_callback_lock);
        if (sk->sk_kern_sock)
                lockdep_set_class_and_name(
                        &sk->sk_callback_lock,
                        af_kern_callback_keys + sk->sk_family,
                        af_family_kern_clock_key_strings[sk->sk_family]);
        else
                lockdep_set_class_and_name(
                        &sk->sk_callback_lock,
                        af_callback_keys + sk->sk_family,
                        af_family_clock_key_strings[sk->sk_family]);

        /* Install the default callbacks defined earlier in this file */
        sk->sk_state_change     =       sock_def_wakeup;
        sk->sk_data_ready       =       sock_def_readable;
        sk->sk_write_space      =       sock_def_write_space;
        sk->sk_error_report     =       sock_def_error_report;
        sk->sk_destruct         =       sock_def_destruct;

        sk->sk_frag.page        =       NULL;
        sk->sk_frag.offset      =       0;
        sk->sk_peek_off         =       -1;

        sk->sk_peer_pid         =       NULL;
        sk->sk_peer_cred        =       NULL;
        sk->sk_write_pending    =       0;
        sk->sk_rcvlowat         =       1;
        /* Send and receive timeouts default to MAX_SCHEDULE_TIMEOUT */
        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;

        sk->sk_stamp = SK_DEFAULT_STAMP;

#ifdef CONFIG_NET_RX_BUSY_POLL
        sk->sk_napi_id          =       0;
        sk->sk_ll_usec          =       sysctl_net_busy_read;
#endif

        /* ~0U: both pacing rates start at the maximum unsigned value */
        sk->sk_max_pacing_rate = ~0U;
        sk->sk_pacing_rate = ~0U;
        sk->sk_incoming_cpu = -1;
        /*
         * Before updating sk_refcnt, we must commit prior changes to memory
         * (Documentation/RCU/rculist_nulls.txt for details)
         */
        smp_wmb();
        refcount_set(&sk->sk_refcnt, 1);
        atomic_set(&sk->sk_drops, 0);
}
2715 EXPORT_SYMBOL(sock_init_data);
2716
/*
 * Take ownership of @sk's socket lock; may sleep.
 *
 * @sk:       sock to lock
 * @subclass: lockdep subclass passed to mutex_acquire()
 *
 * On return sk_lock.owned is 1, the spinlock is released and bottom
 * halves are enabled again.
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);
        /* Already owned: __lock_sock() is presumably where we wait - confirm */
        if (sk->sk_lock.owned)
                __lock_sock(sk);
        sk->sk_lock.owned = 1;
        /* Plain spin_unlock(): BH stays disabled until local_bh_enable() below */
        spin_unlock(&sk->sk_lock.slock);
        /*
         * The sk_lock has mutex_lock() semantics here:
         */
        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
        local_bh_enable();
}
2731 EXPORT_SYMBOL(lock_sock_nested);
2732
/*
 * Give up ownership of @sk's socket lock.
 *
 * With the spinlock held and BH disabled: process the backlog if it is
 * non-empty, run the protocol's release_cb() hook if there is one,
 * release ownership and wake anybody waiting on sk_lock.wq.
 */
void release_sock(struct sock *sk)
{
        spin_lock_bh(&sk->sk_lock.slock);
        /* A non-NULL tail means packets were queued on the backlog */
        if (sk->sk_backlog.tail)
                __release_sock(sk);

        /* Warning : release_cb() might need to release sk ownership,
         * ie call sock_release_ownership(sk) before us.
         */
        if (sk->sk_prot->release_cb)
                sk->sk_prot->release_cb(sk);

        sock_release_ownership(sk);
        /* Only call wake_up() when the wait queue is non-empty */
        if (waitqueue_active(&sk->sk_lock.wq))
                wake_up(&sk->sk_lock.wq);
        spin_unlock_bh(&sk->sk_lock.slock);
}
2750 EXPORT_SYMBOL(release_sock);
2751
/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 *
 * Return: false if the fast path is taken:
 *
 *   sk_lock.slock locked, owned = 0, BH disabled
 *
 * true if the slow path is taken:
 *
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
bool lock_sock_fast(struct sock *sk)
{
        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);

        /* Fast path: nobody owns the lock, return with the spinlock held */
        if (!sk->sk_lock.owned)
                /*
                 * Note : We must disable BH
                 */
                return false;

        /* Slow path: same sequence as lock_sock_nested() with subclass 0 */
        __lock_sock(sk);
        sk->sk_lock.owned = 1;
        /* Plain spin_unlock(): BH stays disabled until local_bh_enable() below */
        spin_unlock(&sk->sk_lock.slock);
        /*
         * The sk_lock has mutex_lock() semantics here:
         */
        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
        local_bh_enable();
        return true;
}
2786 EXPORT_SYMBOL(lock_sock_fast);
2787
2788 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2789 {
2790         struct timeval tv;
2791         if (!sock_flag(sk, SOCK_TIMESTAMP))
2792                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2793         tv = ktime_to_timeval(sk->sk_stamp);
2794         if (tv.tv_sec == -1)
2795                 return -ENOENT;
2796         if (tv.tv_sec == 0) {
2797                 sk->sk_stamp = ktime_get_real();
2798                 tv = ktime_to_timeval(sk->sk_stamp);
2799         }
2800         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2801 }
2802 EXPORT_SYMBOL(sock_get_timestamp);
2803
2804 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2805 {
2806         struct timespec ts;
2807         if (!sock_flag(sk, SOCK_TIMESTAMP))
2808                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2809         ts = ktime_to_timespec(sk->sk_stamp);
2810         if (ts.tv_sec == -1)
2811                 return -ENOENT;
2812         if (ts.tv_sec == 0) {
2813                 sk->sk_stamp = ktime_get_real();
2814                 ts = ktime_to_timespec(sk->sk_stamp);
2815         }
2816         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2817 }
2818 EXPORT_SYMBOL(sock_get_timestampns);
2819
2820 void sock_enable_timestamp(struct sock *sk, int flag)
2821 {
2822         if (!sock_flag(sk, flag)) {
2823                 unsigned long previous_flags = sk->sk_flags;
2824
2825                 sock_set_flag(sk, flag);
2826                 /*
2827                  * we just set one of the two flags which require net
2828                  * time stamping, but time stamping might have been on
2829                  * already because of the other one
2830                  */
2831                 if (sock_needs_netstamp(sk) &&
2832                     !(previous_flags & SK_FLAGS_TIMESTAMP))
2833                         net_enable_timestamp();
2834         }
2835 }
2836
2837 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2838                        int level, int type)
2839 {
2840         struct sock_exterr_skb *serr;
2841         struct sk_buff *skb;
2842         int copied, err;
2843
2844         err = -EAGAIN;
2845         skb = sock_dequeue_err_skb(sk);
2846         if (skb == NULL)
2847                 goto out;
2848
2849         copied = skb->len;
2850         if (copied > len) {
2851                 msg->msg_flags |= MSG_TRUNC;
2852                 copied = len;
2853         }
2854         err = skb_copy_datagram_msg(skb, 0, msg, copied);
2855         if (err)
2856                 goto out_free_skb;
2857
2858         sock_recv_timestamp(msg, sk, skb);
2859
2860         serr = SKB_EXT_ERR(skb);
2861         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2862
2863         msg->msg_flags |= MSG_ERRQUEUE;
2864         err = copied;
2865
2866 out_free_skb:
2867         kfree_skb(skb);
2868 out:
2869         return err;
2870 }
2871 EXPORT_SYMBOL(sock_recv_errqueue);
2872
2873 /*
2874  *      Get a socket option on an socket.
2875  *
2876  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2877  *      asynchronous errors should be reported by getsockopt. We assume
2878  *      this means if you specify SO_ERROR (otherwise whats the point of it).
2879  */
2880 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2881                            char __user *optval, int __user *optlen)
2882 {
2883         struct sock *sk = sock->sk;
2884
2885         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2886 }
2887 EXPORT_SYMBOL(sock_common_getsockopt);
2888
2889 #ifdef CONFIG_COMPAT
2890 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2891                                   char __user *optval, int __user *optlen)
2892 {
2893         struct sock *sk = sock->sk;
2894
2895         if (sk->sk_prot->compat_getsockopt != NULL)
2896                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2897                                                       optval, optlen);
2898         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2899 }
2900 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2901 #endif
2902
2903 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2904                         int flags)
2905 {
2906         struct sock *sk = sock->sk;
2907         int addr_len = 0;
2908         int err;
2909
2910         err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2911                                    flags & ~MSG_DONTWAIT, &addr_len);
2912         if (err >= 0)
2913                 msg->msg_namelen = addr_len;
2914         return err;
2915 }
2916 EXPORT_SYMBOL(sock_common_recvmsg);
2917
2918 /*
2919  *      Set socket options on an inet socket.
2920  */
2921 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2922                            char __user *optval, unsigned int optlen)
2923 {
2924         struct sock *sk = sock->sk;
2925
2926         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2927 }
2928 EXPORT_SYMBOL(sock_common_setsockopt);
2929
2930 #ifdef CONFIG_COMPAT
2931 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2932                                   char __user *optval, unsigned int optlen)
2933 {
2934         struct sock *sk = sock->sk;
2935
2936         if (sk->sk_prot->compat_setsockopt != NULL)
2937                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2938                                                       optval, optlen);
2939         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2940 }
2941 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2942 #endif
2943
/*
 * Common release path for a sock: run the protocol's optional destroy
 * hook, unhash the sock, orphan it, free its xfrm policy and drop one
 * reference.
 */
void sk_common_release(struct sock *sk)
{
        /* destroy is optional; unhash below is called unconditionally */
        if (sk->sk_prot->destroy)
                sk->sk_prot->destroy(sk);

        /*
         * Observation: when sk_common_release is called, processes have
         * no access to socket. But net still has.
         * Step one, detach it from networking:
         *
         * A. Remove from hash tables.
         */

        sk->sk_prot->unhash(sk);

        /*
         * At this point the socket cannot receive new packets, but it is
         * possible that some packets are in flight because some CPU runs
         * the receiver and did the hash table lookup before we unhashed
         * the socket. They will reach the receive queue and will be purged
         * by the socket destructor.
         *
         * Also we still have packets pending on receive queue and probably,
         * our own packets waiting in device queues. sock_destroy will drain
         * receive queue, but transmitted packets will delay socket destruction
         * until the last reference will be released.
         */

        sock_orphan(sk);

        xfrm_sk_free_policy(sk);

        sk_refcnt_debug_release(sk);

        sock_put(sk);
}
2979 EXPORT_SYMBOL(sk_common_release);
2980
2981 void sk_get_meminfo(const struct sock *sk, u32 *mem)
2982 {
2983         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
2984
2985         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
2986         mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
2987         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
2988         mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
2989         mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
2990         mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
2991         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
2992         mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
2993         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
2994 }
2995
2996 #ifdef CONFIG_PROC_FS
/* Number of per-protocol "in use" counter slots */
#define PROTO_INUSE_NR  64      /* should be enough for the first time */
/* One counter per slot, indexed by a protocol's inuse_idx */
struct prot_inuse {
        int val[PROTO_INUSE_NR];
};

/* Bitmap of slots handed out by assign_proto_idx() */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3003
3004 #ifdef CONFIG_NET_NS
3005 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3006 {
3007         __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
3008 }
3009 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3010
3011 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3012 {
3013         int cpu, idx = prot->inuse_idx;
3014         int res = 0;
3015
3016         for_each_possible_cpu(cpu)
3017                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
3018
3019         return res >= 0 ? res : 0;
3020 }
3021 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3022
3023 static int __net_init sock_inuse_init_net(struct net *net)
3024 {
3025         net->core.inuse = alloc_percpu(struct prot_inuse);
3026         return net->core.inuse ? 0 : -ENOMEM;
3027 }
3028
3029 static void __net_exit sock_inuse_exit_net(struct net *net)
3030 {
3031         free_percpu(net->core.inuse);
3032 }
3033
/* Per-namespace hooks that allocate and free the in-use counters */
static struct pernet_operations net_inuse_ops = {
        .init = sock_inuse_init_net,
        .exit = sock_inuse_exit_net,
};
3038
3039 static __init int net_inuse_init(void)
3040 {
3041         if (register_pernet_subsys(&net_inuse_ops))
3042                 panic("Cannot initialize net inuse counters");
3043
3044         return 0;
3045 }
3046
3047 core_initcall(net_inuse_init);
3048 #else
3049 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
3050
3051 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3052 {
3053         __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
3054 }
3055 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3056
3057 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3058 {
3059         int cpu, idx = prot->inuse_idx;
3060         int res = 0;
3061
3062         for_each_possible_cpu(cpu)
3063                 res += per_cpu(prot_inuse, cpu).val[idx];
3064
3065         return res >= 0 ? res : 0;
3066 }
3067 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3068 #endif
3069
3070 static void assign_proto_idx(struct proto *prot)
3071 {
3072         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3073
3074         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3075                 pr_err("PROTO_INUSE_NR exhausted\n");
3076                 return;
3077         }
3078
3079         set_bit(prot->inuse_idx, proto_inuse_idx);
3080 }
3081
3082 static void release_proto_idx(struct proto *prot)
3083 {
3084         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3085                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3086 }
3087 #else
/* No in-use accounting without CONFIG_PROC_FS: nothing to assign. */
static inline void assign_proto_idx(struct proto *prot)
{
}
3091
/* No in-use accounting without CONFIG_PROC_FS: nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3095 #endif
3096
3097 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3098 {
3099         if (!rsk_prot)
3100                 return;
3101         kfree(rsk_prot->slab_name);
3102         rsk_prot->slab_name = NULL;
3103         kmem_cache_destroy(rsk_prot->slab);
3104         rsk_prot->slab = NULL;
3105 }
3106
3107 static int req_prot_init(const struct proto *prot)
3108 {
3109         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3110
3111         if (!rsk_prot)
3112                 return 0;
3113
3114         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3115                                         prot->name);
3116         if (!rsk_prot->slab_name)
3117                 return -ENOMEM;
3118
3119         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3120                                            rsk_prot->obj_size, 0,
3121                                            prot->slab_flags, NULL);
3122
3123         if (!rsk_prot->slab) {
3124                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3125                         prot->name);
3126                 return -ENOMEM;
3127         }
3128         return 0;
3129 }
3130
3131 int proto_register(struct proto *prot, int alloc_slab)
3132 {
3133         if (alloc_slab) {
3134                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
3135                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
3136                                         NULL);
3137
3138                 if (prot->slab == NULL) {
3139                         pr_crit("%s: Can't create sock SLAB cache!\n",
3140                                 prot->name);
3141                         goto out;
3142                 }
3143
3144                 if (req_prot_init(prot))
3145                         goto out_free_request_sock_slab;
3146
3147                 if (prot->twsk_prot != NULL) {
3148                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3149
3150                         if (prot->twsk_prot->twsk_slab_name == NULL)
3151                                 goto out_free_request_sock_slab;
3152
3153                         prot->twsk_prot->twsk_slab =
3154                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3155                                                   prot->twsk_prot->twsk_obj_size,
3156                                                   0,
3157                                                   prot->slab_flags,
3158                                                   NULL);
3159                         if (prot->twsk_prot->twsk_slab == NULL)
3160                                 goto out_free_timewait_sock_slab_name;
3161                 }
3162         }
3163
3164         mutex_lock(&proto_list_mutex);
3165         list_add(&prot->node, &proto_list);
3166         assign_proto_idx(prot);
3167         mutex_unlock(&proto_list_mutex);
3168         return 0;
3169
3170 out_free_timewait_sock_slab_name:
3171         kfree(prot->twsk_prot->twsk_slab_name);
3172 out_free_request_sock_slab:
3173         req_prot_cleanup(prot->rsk_prot);
3174
3175         kmem_cache_destroy(prot->slab);
3176         prot->slab = NULL;
3177 out:
3178         return -ENOBUFS;
3179 }
3180 EXPORT_SYMBOL(proto_register);
3181
3182 void proto_unregister(struct proto *prot)
3183 {
3184         mutex_lock(&proto_list_mutex);
3185         release_proto_idx(prot);
3186         list_del(&prot->node);
3187         mutex_unlock(&proto_list_mutex);
3188
3189         kmem_cache_destroy(prot->slab);
3190         prot->slab = NULL;
3191
3192         req_prot_cleanup(prot->rsk_prot);
3193
3194         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3195                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3196                 kfree(prot->twsk_prot->twsk_slab_name);
3197                 prot->twsk_prot->twsk_slab = NULL;
3198         }
3199 }
3200 EXPORT_SYMBOL(proto_unregister);
3201
3202 #ifdef CONFIG_PROC_FS
3203 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3204         __acquires(proto_list_mutex)
3205 {
3206         mutex_lock(&proto_list_mutex);
3207         return seq_list_start_head(&proto_list, *pos);
3208 }
3209
3210 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3211 {
3212         return seq_list_next(v, &proto_list, pos);
3213 }
3214
3215 static void proto_seq_stop(struct seq_file *seq, void *v)
3216         __releases(proto_list_mutex)
3217 {
3218         mutex_unlock(&proto_list_mutex);
3219 }
3220
/* Map a method pointer to 'y' (non-NULL) or 'n' (NULL). */
static char proto_method_implemented(const void *method)
{
        if (method == NULL)
                return 'n';

        return 'y';
}
3225 static long sock_prot_memory_allocated(struct proto *proto)
3226 {
3227         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3228 }
3229
3230 static char *sock_prot_memory_pressure(struct proto *proto)
3231 {
3232         return proto->memory_pressure != NULL ?
3233         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3234 }
3235
3236 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3237 {
3238
3239         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3240                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3241                    proto->name,
3242                    proto->obj_size,
3243                    sock_prot_inuse_get(seq_file_net(seq), proto),
3244                    sock_prot_memory_allocated(proto),
3245                    sock_prot_memory_pressure(proto),
3246                    proto->max_header,
3247                    proto->slab == NULL ? "no" : "yes",
3248                    module_name(proto->owner),
3249                    proto_method_implemented(proto->close),
3250                    proto_method_implemented(proto->connect),
3251                    proto_method_implemented(proto->disconnect),
3252                    proto_method_implemented(proto->accept),
3253                    proto_method_implemented(proto->ioctl),
3254                    proto_method_implemented(proto->init),
3255                    proto_method_implemented(proto->destroy),
3256                    proto_method_implemented(proto->shutdown),
3257                    proto_method_implemented(proto->setsockopt),
3258                    proto_method_implemented(proto->getsockopt),
3259                    proto_method_implemented(proto->sendmsg),
3260                    proto_method_implemented(proto->recvmsg),
3261                    proto_method_implemented(proto->sendpage),
3262                    proto_method_implemented(proto->bind),
3263                    proto_method_implemented(proto->backlog_rcv),
3264                    proto_method_implemented(proto->hash),
3265                    proto_method_implemented(proto->unhash),
3266                    proto_method_implemented(proto->get_port),
3267                    proto_method_implemented(proto->enter_memory_pressure));
3268 }
3269
3270 static int proto_seq_show(struct seq_file *seq, void *v)
3271 {
3272         if (v == &proto_list)
3273                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3274                            "protocol",
3275                            "size",
3276                            "sockets",
3277                            "memory",
3278                            "press",
3279                            "maxhdr",
3280                            "slab",
3281                            "module",
3282                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3283         else
3284                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3285         return 0;
3286 }
3287
/* seq_file iterator over proto_list, used by proto_seq_open() */
static const struct seq_operations proto_seq_ops = {
        .start  = proto_seq_start,
        .next   = proto_seq_next,
        .stop   = proto_seq_stop,
        .show   = proto_seq_show,
};
3294
3295 static int proto_seq_open(struct inode *inode, struct file *file)
3296 {
3297         return seq_open_net(inode, file, &proto_seq_ops,
3298                             sizeof(struct seq_net_private));
3299 }
3300
/* File operations of the "protocols" proc entry created in proto_init_net() */
static const struct file_operations proto_seq_fops = {
        .owner          = THIS_MODULE,
        .open           = proto_seq_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release_net,
};
3308
3309 static __net_init int proto_init_net(struct net *net)
3310 {
3311         if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3312                 return -ENOMEM;
3313
3314         return 0;
3315 }
3316
3317 static __net_exit void proto_exit_net(struct net *net)
3318 {
3319         remove_proc_entry("protocols", net->proc_net);
3320 }
3321
3322
/* Per-namespace hooks that create and remove the "protocols" proc entry */
static __net_initdata struct pernet_operations proto_net_ops = {
        .init = proto_init_net,
        .exit = proto_exit_net,
};
3327
3328 static int __init proto_init(void)
3329 {
3330         return register_pernet_subsys(&proto_net_ops);
3331 }
3332
3333 subsys_initcall(proto_init);
3334
3335 #endif /* PROC_FS */
3336
3337 #ifdef CONFIG_NET_RX_BUSY_POLL
3338 bool sk_busy_loop_end(void *p, unsigned long start_time)
3339 {
3340         struct sock *sk = p;
3341
3342         return !skb_queue_empty(&sk->sk_receive_queue) ||
3343                sk_busy_loop_timeout(sk, start_time);
3344 }
3345 EXPORT_SYMBOL(sk_busy_loop_end);
3346 #endif /* CONFIG_NET_RX_BUSY_POLL */