net/unix/af_unix.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * NET4:        Implementation of BSD Unix domain sockets.
   4  *
   5  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   6  *
   7  * Fixes:
   8  *              Linus Torvalds  :       Assorted bug cures.
   9  *              Niibe Yutaka    :       async I/O support.
  10  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  11  *              Alan Cox        :       Limit size of allocated blocks.
  12  *              Alan Cox        :       Fixed the stupid socketpair bug.
  13  *              Alan Cox        :       BSD compatibility fine tuning.
  14  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  15  *              Alan Cox        :       Sorted out a proper draft version of
  16  *                                      file descriptor passing hacked up from
  17  *                                      Mike Shaver's work.
  18  *              Marty Leisner   :       Fixes to fd passing
  19  *              Nick Nevin      :       recvmsg bugfix.
  20  *              Alan Cox        :       Started proper garbage collector
  21  *              Heiko EiBfeldt  :       Missing verify_area check
  22  *              Alan Cox        :       Started POSIXisms
  23  *              Andreas Schwab  :       Replace inode by dentry for proper
  24  *                                      reference counting
  25  *              Kirk Petersen   :       Made this a module
  26  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  27  *                                      Lots of bug fixes.
 *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
 *                                      by above two patches.
 *           Andrea Arcangeli   :       If possible we block in connect(2)
 *                                      if the max backlog of the listen socket
 *                                      has been reached. This won't break
 *                                      old apps and it will avoid huge amount
 *                                      of socks hashed (this for unix_gc()
 *                                      performance reasons).
  36  *                                      Security fix that limits the max
  37  *                                      number of socks to 2*max_files and
  38  *                                      the number of skb queueable in the
  39  *                                      dgram receiver.
  40  *              Artur Skawina   :       Hash function optimizations
  41  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  42  *            Malcolm Beattie   :       Set peercred for socketpair
  43  *           Michal Ostrowski   :       Module initialization cleanup.
  44  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  45  *                                      the core infrastructure is doing that
  46  *                                      for all net proto families now (2.5.69+)
  47  *
  48  * Known differences from reference BSD that was tested:
  49  *
  50  *      [TO FIX]
  51  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  52  *              other the moment one end closes.
  53  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  54  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  55  *      [NOT TO FIX]
  56  *      accept() returns a path name even if the connecting socket has closed
  57  *              in the meantime (BSD loses the path and gives up).
  58  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  59  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  60  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  61  *      BSD af_unix apparently has connect forgetting to block properly.
  62  *              (need to check this with the POSIX spec in detail)
  63  *
  64  * Differences from 2.0.0-11-... (ANK)
  65  *      Bug fixes and improvements.
  66  *              - client shutdown killed server socket.
  67  *              - removed all useless cli/sti pairs.
  68  *
  69  *      Semantic changes/extensions.
  70  *              - generic control message passing.
  71  *              - SCM_CREDENTIALS control message.
  72  *              - "Abstract" (not FS based) socket bindings.
  73  *                Abstract names are sequences of bytes (not zero terminated)
  74  *                started by 0, so that this name space does not intersect
  75  *                with BSD names.
  76  */
  77
  78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  79
  80 #include <linux/module.h>
  81 #include <linux/kernel.h>
  82 #include <linux/signal.h>
  83 #include <linux/sched/signal.h>
  84 #include <linux/errno.h>
  85 #include <linux/string.h>
  86 #include <linux/stat.h>
  87 #include <linux/dcache.h>
  88 #include <linux/namei.h>
  89 #include <linux/socket.h>
  90 #include <linux/un.h>
  91 #include <linux/fcntl.h>
  92 #include <linux/filter.h>
  93 #include <linux/termios.h>
  94 #include <linux/sockios.h>
  95 #include <linux/net.h>
  96 #include <linux/in.h>
  97 #include <linux/fs.h>
  98 #include <linux/slab.h>
  99 #include <linux/uaccess.h>
 100 #include <linux/skbuff.h>
 101 #include <linux/netdevice.h>
 102 #include <net/net_namespace.h>
 103 #include <net/sock.h>
 104 #include <net/tcp_states.h>
 105 #include <net/af_unix.h>
 106 #include <linux/proc_fs.h>
 107 #include <linux/seq_file.h>
 108 #include <net/scm.h>
 109 #include <linux/init.h>
 110 #include <linux/poll.h>
 111 #include <linux/rtnetlink.h>
 112 #include <linux/mount.h>
 113 #include <net/checksum.h>
 114 #include <linux/security.h>
 115 #include <linux/splice.h>
 116 #include <linux/freezer.h>
 117 #include <linux/file.h>
 118 #include <linux/btf_ids.h>
 119
 120 #include "scm.h"
 121
/* Count of AF_UNIX sockets; decremented in unix_sock_destructor(). */
static atomic_long_t unix_nr_socks;
/* Hash chains of sockets bound to a filesystem object.  Chains are
 * selected by unix_bsd_hash() of the backing inode when looking a socket
 * up (see unix_find_socket_byinode()) and by sk->sk_hash when inserting
 * or removing one.
 */
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
/* One spinlock per bsd_socket_buckets[] chain, same index. */
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 *    hash table is protected with spinlock.
 *    each socket state is protected by separate spinlock.
 */
 130
 131 static unsigned int unix_unbound_hash(struct sock *sk)
 132 {
 133         unsigned long hash = (unsigned long)sk;
 134
 135         hash ^= hash >> 16;
 136         hash ^= hash >> 8;
 137         hash ^= sk->sk_type;
 138
 139         return hash & UNIX_HASH_MOD;
 140 }
 141
 142 static unsigned int unix_bsd_hash(struct inode *i)
 143 {
 144         return i->i_ino & UNIX_HASH_MOD;
 145 }
 146
 147 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
 148                                        int addr_len, int type)
 149 {
 150         __wsum csum = csum_partial(sunaddr, addr_len, 0);
 151         unsigned int hash;
 152
 153         hash = (__force unsigned int)csum_fold(csum);
 154         hash ^= hash >> 8;
 155         hash ^= type;
 156
 157         return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
 158 }
 159
 160 static void unix_table_double_lock(struct net *net,
 161                                    unsigned int hash1, unsigned int hash2)
 162 {
 163         if (hash1 == hash2) {
 164                 spin_lock(&net->unx.table.locks[hash1]);
 165                 return;
 166         }
 167
 168         if (hash1 > hash2)
 169                 swap(hash1, hash2);
 170
 171         spin_lock(&net->unx.table.locks[hash1]);
 172         spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
 173 }
 174
 175 static void unix_table_double_unlock(struct net *net,
 176                                      unsigned int hash1, unsigned int hash2)
 177 {
 178         if (hash1 == hash2) {
 179                 spin_unlock(&net->unx.table.locks[hash1]);
 180                 return;
 181         }
 182
 183         spin_unlock(&net->unx.table.locks[hash1]);
 184         spin_unlock(&net->unx.table.locks[hash2]);
 185 }
 186
#ifdef CONFIG_SECURITY_NETWORK
/* Record the security ID held in @scm in the skb's UNIXCB. */
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
        UNIXCB(skb).secid = scm->secid;
}

/* Load the security ID recorded in the skb's UNIXCB into @scm. */
static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
        scm->secid = UNIXCB(skb).secid;
}

/* Return true when @scm and the skb carry the same security ID. */
static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
        return (scm->secid == UNIXCB(skb).secid);
}
#else
/* Without CONFIG_SECURITY_NETWORK the get/set helpers do nothing and the
 * comparison always reports a match.
 */
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
        return true;
}
#endif /* CONFIG_SECURITY_NETWORK */
 214
/* Peer pointer of a socket.  Expands to the struct member itself, so it
 * can be both read and assigned (e.g. unix_peer(sk) = NULL).
 */
#define unix_peer(sk) (unix_sk(sk)->peer)
 216
 217 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 218 {
 219         return unix_peer(osk) == sk;
 220 }
 221
 222 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 223 {
 224         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 225 }
 226
 227 static inline int unix_recvq_full(const struct sock *sk)
 228 {
 229         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 230 }
 231
 232 static inline int unix_recvq_full_lockless(const struct sock *sk)
 233 {
 234         return skb_queue_len_lockless(&sk->sk_receive_queue) >
 235                 READ_ONCE(sk->sk_max_ack_backlog);
 236 }
 237
 238 struct sock *unix_peer_get(struct sock *s)
 239 {
 240         struct sock *peer;
 241
 242         unix_state_lock(s);
 243         peer = unix_peer(s);
 244         if (peer)
 245                 sock_hold(peer);
 246         unix_state_unlock(s);
 247         return peer;
 248 }
 249 EXPORT_SYMBOL_GPL(unix_peer_get);
 250
 251 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
 252                                              int addr_len)
 253 {
 254         struct unix_address *addr;
 255
 256         addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
 257         if (!addr)
 258                 return NULL;
 259
 260         refcount_set(&addr->refcnt, 1);
 261         addr->len = addr_len;
 262         memcpy(addr->name, sunaddr, addr_len);
 263
 264         return addr;
 265 }
 266
 267 static inline void unix_release_addr(struct unix_address *addr)
 268 {
 269         if (refcount_dec_and_test(&addr->refcnt))
 270                 kfree(addr);
 271 }
 272
 273 /*
 274  *      Check unix socket name:
 275  *              - should be not zero length.
 276  *              - if started by not zero, should be NULL terminated (FS object)
 277  *              - if started by zero, it is abstract name.
 278  */
 279
 280 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
 281 {
 282         if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
 283             addr_len > sizeof(*sunaddr))
 284                 return -EINVAL;
 285
 286         if (sunaddr->sun_family != AF_UNIX)
 287                 return -EINVAL;
 288
 289         return 0;
 290 }
 291
 292 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
 293 {
 294         struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
 295         short offset = offsetof(struct sockaddr_storage, __data);
 296
 297         BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
 298
 299         /* This may look like an off by one error but it is a bit more
 300          * subtle.  108 is the longest valid AF_UNIX path for a binding.
 301          * sun_path[108] doesn't as such exist.  However in kernel space
 302          * we are guaranteed that it is a valid memory location in our
 303          * kernel address buffer because syscall functions always pass
 304          * a pointer of struct sockaddr_storage which has a bigger buffer
 305          * than 108.  Also, we must terminate sun_path for strlen() in
 306          * getname_kernel().
 307          */
 308         addr->__data[addr_len - offset] = 0;
 309
 310         /* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
 311          * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
 312          * know the actual buffer.
 313          */
 314         return strlen(addr->__data) + offset + 1;
 315 }
 316
 317 static void __unix_remove_socket(struct sock *sk)
 318 {
 319         sk_del_node_init(sk);
 320 }
 321
 322 static void __unix_insert_socket(struct net *net, struct sock *sk)
 323 {
 324         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
 325         sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
 326 }
 327
 328 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
 329                                  struct unix_address *addr, unsigned int hash)
 330 {
 331         __unix_remove_socket(sk);
 332         smp_store_release(&unix_sk(sk)->addr, addr);
 333
 334         sk->sk_hash = hash;
 335         __unix_insert_socket(net, sk);
 336 }
 337
 338 static void unix_remove_socket(struct net *net, struct sock *sk)
 339 {
 340         spin_lock(&net->unx.table.locks[sk->sk_hash]);
 341         __unix_remove_socket(sk);
 342         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
 343 }
 344
 345 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
 346 {
 347         spin_lock(&net->unx.table.locks[sk->sk_hash]);
 348         __unix_insert_socket(net, sk);
 349         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
 350 }
 351
 352 static void unix_insert_bsd_socket(struct sock *sk)
 353 {
 354         spin_lock(&bsd_socket_locks[sk->sk_hash]);
 355         sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
 356         spin_unlock(&bsd_socket_locks[sk->sk_hash]);
 357 }
 358
 359 static void unix_remove_bsd_socket(struct sock *sk)
 360 {
 361         if (!hlist_unhashed(&sk->sk_bind_node)) {
 362                 spin_lock(&bsd_socket_locks[sk->sk_hash]);
 363                 __sk_del_bind_node(sk);
 364                 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
 365
 366                 sk_node_init(&sk->sk_bind_node);
 367         }
 368 }
 369
 370 static struct sock *__unix_find_socket_byname(struct net *net,
 371                                               struct sockaddr_un *sunname,
 372                                               int len, unsigned int hash)
 373 {
 374         struct sock *s;
 375
 376         sk_for_each(s, &net->unx.table.buckets[hash]) {
 377                 struct unix_sock *u = unix_sk(s);
 378
 379                 if (u->addr->len == len &&
 380                     !memcmp(u->addr->name, sunname, len))
 381                         return s;
 382         }
 383         return NULL;
 384 }
 385
 386 static inline struct sock *unix_find_socket_byname(struct net *net,
 387                                                    struct sockaddr_un *sunname,
 388                                                    int len, unsigned int hash)
 389 {
 390         struct sock *s;
 391
 392         spin_lock(&net->unx.table.locks[hash]);
 393         s = __unix_find_socket_byname(net, sunname, len, hash);
 394         if (s)
 395                 sock_hold(s);
 396         spin_unlock(&net->unx.table.locks[hash]);
 397         return s;
 398 }
 399
 400 static struct sock *unix_find_socket_byinode(struct inode *i)
 401 {
 402         unsigned int hash = unix_bsd_hash(i);
 403         struct sock *s;
 404
 405         spin_lock(&bsd_socket_locks[hash]);
 406         sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
 407                 struct dentry *dentry = unix_sk(s)->path.dentry;
 408
 409                 if (dentry && d_backing_inode(dentry) == i) {
 410                         sock_hold(s);
 411                         spin_unlock(&bsd_socket_locks[hash]);
 412                         return s;
 413                 }
 414         }
 415         spin_unlock(&bsd_socket_locks[hash]);
 416         return NULL;
 417 }
 418
 419 /* Support code for asymmetrically connected dgram sockets
 420  *
 421  * If a datagram socket is connected to a socket not itself connected
 422  * to the first socket (eg, /dev/log), clients may only enqueue more
 423  * messages if the present receive queue of the server socket is not
 424  * "too large". This means there's a second writeability condition
 425  * poll and sendmsg need to test. The dgram recv code will do a wake
 426  * up on the peer_wait wait queue of a socket upon reception of a
 427  * datagram which needs to be propagated to sleeping would-be writers
 428  * since these might not have sent anything so far. This can't be
 429  * accomplished via poll_wait because the lifetime of the server
 430  * socket might be less than that of its clients if these break their
 431  * association with it or if the server socket is closed while clients
 432  * are still connected to it and there's no way to inform "a polling
 433  * implementation" that it should let go of a certain wait queue
 434  *
 435  * In order to propagate a wake up, a wait_queue_entry_t of the client
 436  * socket is enqueued on the peer_wait queue of the server socket
 437  * whose wake function does a wake_up on the ordinary client socket
 438  * wait queue. This connection is established whenever a write (or
 439  * poll for write) hit the flow control condition and broken when the
 440  * association to the server socket is dissolved or after a wake up
 441  * was relayed.
 442  */
 443
 444 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
 445                                       void *key)
 446 {
 447         struct unix_sock *u;
 448         wait_queue_head_t *u_sleep;
 449
 450         u = container_of(q, struct unix_sock, peer_wake);
 451
 452         __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
 453                             q);
 454         u->peer_wake.private = NULL;
 455
 456         /* relaying can only happen while the wq still exists */
 457         u_sleep = sk_sleep(&u->sk);
 458         if (u_sleep)
 459                 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
 460
 461         return 0;
 462 }
 463
 464 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
 465 {
 466         struct unix_sock *u, *u_other;
 467         int rc;
 468
 469         u = unix_sk(sk);
 470         u_other = unix_sk(other);
 471         rc = 0;
 472         spin_lock(&u_other->peer_wait.lock);
 473
 474         if (!u->peer_wake.private) {
 475                 u->peer_wake.private = other;
 476                 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
 477
 478                 rc = 1;
 479         }
 480
 481         spin_unlock(&u_other->peer_wait.lock);
 482         return rc;
 483 }
 484
 485 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
 486                                             struct sock *other)
 487 {
 488         struct unix_sock *u, *u_other;
 489
 490         u = unix_sk(sk);
 491         u_other = unix_sk(other);
 492         spin_lock(&u_other->peer_wait.lock);
 493
 494         if (u->peer_wake.private == other) {
 495                 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
 496                 u->peer_wake.private = NULL;
 497         }
 498
 499         spin_unlock(&u_other->peer_wait.lock);
 500 }
 501
 502 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
 503                                                    struct sock *other)
 504 {
 505         unix_dgram_peer_wake_disconnect(sk, other);
 506         wake_up_interruptible_poll(sk_sleep(sk),
 507                                    EPOLLOUT |
 508                                    EPOLLWRNORM |
 509                                    EPOLLWRBAND);
 510 }
 511
 512 /* preconditions:
 513  *      - unix_peer(sk) == other
 514  *      - association is stable
 515  */
 516 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
 517 {
 518         int connected;
 519
 520         connected = unix_dgram_peer_wake_connect(sk, other);
 521
 522         /* If other is SOCK_DEAD, we want to make sure we signal
 523          * POLLOUT, such that a subsequent write() can get a
 524          * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
 525          * to other and its full, we will hang waiting for POLLOUT.
 526          */
 527         if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
 528                 return 1;
 529
 530         if (connected)
 531                 unix_dgram_peer_wake_disconnect(sk, other);
 532
 533         return 0;
 534 }
 535
 536 static int unix_writable(const struct sock *sk)
 537 {
 538         return sk->sk_state != TCP_LISTEN &&
 539                (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 540 }
 541
 542 static void unix_write_space(struct sock *sk)
 543 {
 544         struct socket_wq *wq;
 545
 546         rcu_read_lock();
 547         if (unix_writable(sk)) {
 548                 wq = rcu_dereference(sk->sk_wq);
 549                 if (skwq_has_sleeper(wq))
 550                         wake_up_interruptible_sync_poll(&wq->wait,
 551                                 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
 552                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 553         }
 554         rcu_read_unlock();
 555 }
 556
 557 /* When dgram socket disconnects (or changes its peer), we clear its receive
 558  * queue of packets arrived from previous peer. First, it allows to do
 559  * flow control based only on wmem_alloc; second, sk connected to peer
 560  * may receive messages only from that peer. */
 561 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 562 {
 563         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 564                 skb_queue_purge(&sk->sk_receive_queue);
 565                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 566
 567                 /* If one link of bidirectional dgram pipe is disconnected,
 568                  * we signal error. Messages are lost. Do not make this,
 569                  * when peer was not connected to us.
 570                  */
 571                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 572                         WRITE_ONCE(other->sk_err, ECONNRESET);
 573                         sk_error_report(other);
 574                 }
 575         }
 576         other->sk_state = TCP_CLOSE;
 577 }
 578
 579 static void unix_sock_destructor(struct sock *sk)
 580 {
 581         struct unix_sock *u = unix_sk(sk);
 582
 583         skb_queue_purge(&sk->sk_receive_queue);
 584
 585         DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
 586         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
 587         DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
 588         if (!sock_flag(sk, SOCK_DEAD)) {
 589                 pr_info("Attempt to release alive unix socket: %p\n", sk);
 590                 return;
 591         }
 592
 593         if (u->addr)
 594                 unix_release_addr(u->addr);
 595
 596         atomic_long_dec(&unix_nr_socks);
 597         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 598 #ifdef UNIX_REFCNT_DEBUG
 599         pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
 600                 atomic_long_read(&unix_nr_socks));
 601 #endif
 602 }
 603
 604 static void unix_release_sock(struct sock *sk, int embrion)
 605 {
 606         struct unix_sock *u = unix_sk(sk);
 607         struct sock *skpair;
 608         struct sk_buff *skb;
 609         struct path path;
 610         int state;
 611
 612         unix_remove_socket(sock_net(sk), sk);
 613         unix_remove_bsd_socket(sk);
 614
 615         /* Clear state */
 616         unix_state_lock(sk);
 617         sock_orphan(sk);
 618         WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
 619         path         = u->path;
 620         u->path.dentry = NULL;
 621         u->path.mnt = NULL;
 622         state = sk->sk_state;
 623         sk->sk_state = TCP_CLOSE;
 624
 625         skpair = unix_peer(sk);
 626         unix_peer(sk) = NULL;
 627
 628         unix_state_unlock(sk);
 629
 630 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
 631         if (u->oob_skb) {
 632                 kfree_skb(u->oob_skb);
 633                 u->oob_skb = NULL;
 634         }
 635 #endif
 636
 637         wake_up_interruptible_all(&u->peer_wait);
 638
 639         if (skpair != NULL) {
 640                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 641                         unix_state_lock(skpair);
 642                         /* No more writes */
 643                         WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
 644                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
 645                                 WRITE_ONCE(skpair->sk_err, ECONNRESET);
 646                         unix_state_unlock(skpair);
 647                         skpair->sk_state_change(skpair);
 648                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 649                 }
 650
 651                 unix_dgram_peer_wake_disconnect(sk, skpair);
 652                 sock_put(skpair); /* It may now die */
 653         }
 654
 655         /* Try to flush out this socket. Throw out buffers at least */
 656
 657         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 658                 if (state == TCP_LISTEN)
 659                         unix_release_sock(skb->sk, 1);
 660                 /* passed fds are erased in the kfree_skb hook        */
 661                 UNIXCB(skb).consumed = skb->len;
 662                 kfree_skb(skb);
 663         }
 664
 665         if (path.dentry)
 666                 path_put(&path);
 667
 668         sock_put(sk);
 669
 670         /* ---- Socket is dead now and most probably destroyed ---- */
 671
 672         /*
 673          * Fixme: BSD difference: In BSD all sockets connected to us get
 674          *        ECONNRESET and we die on the spot. In Linux we behave
 675          *        like files and pipes do and wait for the last
 676          *        dereference.
 677          *
 678          * Can't we simply set sock->err?
 679          *
 680          *        What the above comment does talk about? --ANK(980817)
 681          */
 682
 683         if (READ_ONCE(unix_tot_inflight))
 684                 unix_gc();              /* Garbage collect fds */
 685 }
 686
 687 static void init_peercred(struct sock *sk)
 688 {
 689         const struct cred *old_cred;
 690         struct pid *old_pid;
 691
 692         spin_lock(&sk->sk_peer_lock);
 693         old_pid = sk->sk_peer_pid;
 694         old_cred = sk->sk_peer_cred;
 695         sk->sk_peer_pid  = get_pid(task_tgid(current));
 696         sk->sk_peer_cred = get_current_cred();
 697         spin_unlock(&sk->sk_peer_lock);
 698
 699         put_pid(old_pid);
 700         put_cred(old_cred);
 701 }
 702
 703 static void copy_peercred(struct sock *sk, struct sock *peersk)
 704 {
 705         const struct cred *old_cred;
 706         struct pid *old_pid;
 707
 708         if (sk < peersk) {
 709                 spin_lock(&sk->sk_peer_lock);
 710                 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
 711         } else {
 712                 spin_lock(&peersk->sk_peer_lock);
 713                 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
 714         }
 715         old_pid = sk->sk_peer_pid;
 716         old_cred = sk->sk_peer_cred;
 717         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 718         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 719
 720         spin_unlock(&sk->sk_peer_lock);
 721         spin_unlock(&peersk->sk_peer_lock);
 722
 723         put_pid(old_pid);
 724         put_cred(old_cred);
 725 }
 726
 727 static int unix_listen(struct socket *sock, int backlog)
 728 {
 729         int err;
 730         struct sock *sk = sock->sk;
 731         struct unix_sock *u = unix_sk(sk);
 732
 733         err = -EOPNOTSUPP;
 734         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 735                 goto out;       /* Only stream/seqpacket sockets accept */
 736         err = -EINVAL;
 737         if (!u->addr)
 738                 goto out;       /* No listens on an unbound socket */
 739         unix_state_lock(sk);
 740         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 741                 goto out_unlock;
 742         if (backlog > sk->sk_max_ack_backlog)
 743                 wake_up_interruptible_all(&u->peer_wait);
 744         sk->sk_max_ack_backlog  = backlog;
 745         sk->sk_state            = TCP_LISTEN;
 746         /* set credentials so connect can copy them */
 747         init_peercred(sk);
 748         err = 0;
 749
 750 out_unlock:
 751         unix_state_unlock(sk);
 752 out:
 753         return err;
 754 }
 755
/* Forward declarations of handlers that are defined later in this file.
 * They are needed before their definitions, e.g. by the proto_ops tables
 * such as unix_stream_ops.
 */
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
                               int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int, bool);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
                                    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
                                       struct pipe_inode_info *, size_t size,
                                       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
                              int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
                                  int);
 785
 786 static int unix_set_peek_off(struct sock *sk, int val)
 787 {
 788         struct unix_sock *u = unix_sk(sk);
 789
 790         if (mutex_lock_interruptible(&u->iolock))
 791                 return -EINTR;
 792
 793         WRITE_ONCE(sk->sk_peek_off, val);
 794         mutex_unlock(&u->iolock);
 795
 796         return 0;
 797 }
 798
 799 #ifdef CONFIG_PROC_FS
 800 static int unix_count_nr_fds(struct sock *sk)
 801 {
 802         struct sk_buff *skb;
 803         struct unix_sock *u;
 804         int nr_fds = 0;
 805
 806         spin_lock(&sk->sk_receive_queue.lock);
 807         skb = skb_peek(&sk->sk_receive_queue);
 808         while (skb) {
 809                 u = unix_sk(skb->sk);
 810                 nr_fds += atomic_read(&u->scm_stat.nr_fds);
 811                 skb = skb_peek_next(skb, &sk->sk_receive_queue);
 812         }
 813         spin_unlock(&sk->sk_receive_queue.lock);
 814
 815         return nr_fds;
 816 }
 817
 818 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
 819 {
 820         struct sock *sk = sock->sk;
 821         unsigned char s_state;
 822         struct unix_sock *u;
 823         int nr_fds = 0;
 824
 825         if (sk) {
 826                 s_state = READ_ONCE(sk->sk_state);
 827                 u = unix_sk(sk);
 828
 829                 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
 830                  * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
 831                  * SOCK_DGRAM is ordinary. So, no lock is needed.
 832                  */
 833                 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
 834                         nr_fds = atomic_read(&u->scm_stat.nr_fds);
 835                 else if (s_state == TCP_LISTEN)
 836                         nr_fds = unix_count_nr_fds(sk);
 837
 838                 seq_printf(m, "scm_fds: %u\n", nr_fds);
 839         }
 840 }
 841 #else
 842 #define unix_show_fdinfo NULL
 843 #endif
 844
 845 static const struct proto_ops unix_stream_ops = {
 846         .family =       PF_UNIX,
 847         .owner =        THIS_MODULE,
 848         .release =      unix_release,
 849         .bind =         unix_bind,
 850         .connect =      unix_stream_connect,
 851         .socketpair =   unix_socketpair,
 852         .accept =       unix_accept,
 853         .getname =      unix_getname,
 854         .poll =         unix_poll,
 855         .ioctl =        unix_ioctl,
 856 #ifdef CONFIG_COMPAT
 857         .compat_ioctl = unix_compat_ioctl,
 858 #endif
 859         .listen =       unix_listen,
 860         .shutdown =     unix_shutdown,
 861         .sendmsg =      unix_stream_sendmsg,
 862         .recvmsg =      unix_stream_recvmsg,
 863         .read_skb =     unix_stream_read_skb,
 864         .mmap =         sock_no_mmap,
 865         .splice_read =  unix_stream_splice_read,
 866         .set_peek_off = unix_set_peek_off,
 867         .show_fdinfo =  unix_show_fdinfo,
 868 };
 869
 870 static const struct proto_ops unix_dgram_ops = {
 871         .family =       PF_UNIX,
 872         .owner =        THIS_MODULE,
 873         .release =      unix_release,
 874         .bind =         unix_bind,
 875         .connect =      unix_dgram_connect,
 876         .socketpair =   unix_socketpair,
 877         .accept =       sock_no_accept,
 878         .getname =      unix_getname,
 879         .poll =         unix_dgram_poll,
 880         .ioctl =        unix_ioctl,
 881 #ifdef CONFIG_COMPAT
 882         .compat_ioctl = unix_compat_ioctl,
 883 #endif
 884         .listen =       sock_no_listen,
 885         .shutdown =     unix_shutdown,
 886         .sendmsg =      unix_dgram_sendmsg,
 887         .read_skb =     unix_read_skb,
 888         .recvmsg =      unix_dgram_recvmsg,
 889         .mmap =         sock_no_mmap,
 890         .set_peek_off = unix_set_peek_off,
 891         .show_fdinfo =  unix_show_fdinfo,
 892 };
 893
 894 static const struct proto_ops unix_seqpacket_ops = {
 895         .family =       PF_UNIX,
 896         .owner =        THIS_MODULE,
 897         .release =      unix_release,
 898         .bind =         unix_bind,
 899         .connect =      unix_stream_connect,
 900         .socketpair =   unix_socketpair,
 901         .accept =       unix_accept,
 902         .getname =      unix_getname,
 903         .poll =         unix_dgram_poll,
 904         .ioctl =        unix_ioctl,
 905 #ifdef CONFIG_COMPAT
 906         .compat_ioctl = unix_compat_ioctl,
 907 #endif
 908         .listen =       unix_listen,
 909         .shutdown =     unix_shutdown,
 910         .sendmsg =      unix_seqpacket_sendmsg,
 911         .recvmsg =      unix_seqpacket_recvmsg,
 912         .mmap =         sock_no_mmap,
 913         .set_peek_off = unix_set_peek_off,
 914         .show_fdinfo =  unix_show_fdinfo,
 915 };
 916
 917 static void unix_close(struct sock *sk, long timeout)
 918 {
 919         /* Nothing to do here, unix socket does not need a ->close().
 920          * This is merely for sockmap.
 921          */
 922 }
 923
 924 static void unix_unhash(struct sock *sk)
 925 {
 926         /* Nothing to do here, unix socket does not need a ->unhash().
 927          * This is merely for sockmap.
 928          */
 929 }
 930
 931 static bool unix_bpf_bypass_getsockopt(int level, int optname)
 932 {
 933         if (level == SOL_SOCKET) {
 934                 switch (optname) {
 935                 case SO_PEERPIDFD:
 936                         return true;
 937                 default:
 938                         return false;
 939                 }
 940         }
 941
 942         return false;
 943 }
 944
 945 struct proto unix_dgram_proto = {
 946         .name                   = "UNIX",
 947         .owner                  = THIS_MODULE,
 948         .obj_size               = sizeof(struct unix_sock),
 949         .close                  = unix_close,
 950         .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
 951 #ifdef CONFIG_BPF_SYSCALL
 952         .psock_update_sk_prot   = unix_dgram_bpf_update_proto,
 953 #endif
 954 };
 955
 956 struct proto unix_stream_proto = {
 957         .name                   = "UNIX-STREAM",
 958         .owner                  = THIS_MODULE,
 959         .obj_size               = sizeof(struct unix_sock),
 960         .close                  = unix_close,
 961         .unhash                 = unix_unhash,
 962         .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
 963 #ifdef CONFIG_BPF_SYSCALL
 964         .psock_update_sk_prot   = unix_stream_bpf_update_proto,
 965 #endif
 966 };
 967
 968 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
 969 {
 970         struct unix_sock *u;
 971         struct sock *sk;
 972         int err;
 973
 974         atomic_long_inc(&unix_nr_socks);
 975         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
 976                 err = -ENFILE;
 977                 goto err;
 978         }
 979
 980         if (type == SOCK_STREAM)
 981                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
 982         else /*dgram and  seqpacket */
 983                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
 984
 985         if (!sk) {
 986                 err = -ENOMEM;
 987                 goto err;
 988         }
 989
 990         sock_init_data(sock, sk);
 991
 992         sk->sk_hash             = unix_unbound_hash(sk);
 993         sk->sk_allocation       = GFP_KERNEL_ACCOUNT;
 994         sk->sk_write_space      = unix_write_space;
 995         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 996         sk->sk_destruct         = unix_sock_destructor;
 997         u         = unix_sk(sk);
 998         u->path.dentry = NULL;
 999         u->path.mnt = NULL;
1000         spin_lock_init(&u->lock);
1001         atomic_long_set(&u->inflight, 0);
1002         INIT_LIST_HEAD(&u->link);
1003         mutex_init(&u->iolock); /* single task reading lock */
1004         mutex_init(&u->bindlock); /* single task binding lock */
1005         init_waitqueue_head(&u->peer_wait);
1006         init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1007         memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1008         unix_insert_unbound_socket(net, sk);
1009
1010         sock_prot_inuse_add(net, sk->sk_prot, 1);
1011
1012         return sk;
1013
1014 err:
1015         atomic_long_dec(&unix_nr_socks);
1016         return ERR_PTR(err);
1017 }
1018
1019 static int unix_create(struct net *net, struct socket *sock, int protocol,
1020                        int kern)
1021 {
1022         struct sock *sk;
1023
1024         if (protocol && protocol != PF_UNIX)
1025                 return -EPROTONOSUPPORT;
1026
1027         sock->state = SS_UNCONNECTED;
1028
1029         switch (sock->type) {
1030         case SOCK_STREAM:
1031                 sock->ops = &unix_stream_ops;
1032                 break;
1033                 /*
1034                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
1035                  *      nothing uses it.
1036                  */
1037         case SOCK_RAW:
1038                 sock->type = SOCK_DGRAM;
1039                 fallthrough;
1040         case SOCK_DGRAM:
1041                 sock->ops = &unix_dgram_ops;
1042                 break;
1043         case SOCK_SEQPACKET:
1044                 sock->ops = &unix_seqpacket_ops;
1045                 break;
1046         default:
1047                 return -ESOCKTNOSUPPORT;
1048         }
1049
1050         sk = unix_create1(net, sock, kern, sock->type);
1051         if (IS_ERR(sk))
1052                 return PTR_ERR(sk);
1053
1054         return 0;
1055 }
1056
1057 static int unix_release(struct socket *sock)
1058 {
1059         struct sock *sk = sock->sk;
1060
1061         if (!sk)
1062                 return 0;
1063
1064         sk->sk_prot->close(sk, 0);
1065         unix_release_sock(sk, 0);
1066         sock->sk = NULL;
1067
1068         return 0;
1069 }
1070
/*
 * Look up the socket bound to the filesystem path in @sunaddr.
 *
 * Returns the socket found by unix_find_socket_byinode() (the failure path
 * below drops it with sock_put(), so the caller owns a reference), or an
 * ERR_PTR():
 *   - the error from kern_path() or path_permission(MAY_WRITE),
 *   - -ECONNREFUSED when the path is not a socket inode or no socket is
 *     bound to it,
 *   - -EPROTOTYPE when the socket's type differs from @type.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);

	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		return ERR_PTR(err);

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto put_path;

	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode)) {
		err = -ECONNREFUSED;
		goto put_path;
	}

	sk = unix_find_socket_byinode(inode);
	if (!sk) {
		err = -ECONNREFUSED;
		goto put_path;
	}

	if (sk->sk_type != type) {
		err = -EPROTOTYPE;
		sock_put(sk);
		goto put_path;
	}

	touch_atime(&path);
	path_put(&path);

	return sk;

put_path:
	path_put(&path);
	return ERR_PTR(err);
}
1114
/*
 * Look up a socket by name through the abstract-name hash.
 *
 * Returns the socket found by unix_find_socket_byname(), or
 * ERR_PTR(-ECONNREFUSED) when there is none.  If the socket found carries a
 * filesystem path, its access time is touched.
 */
static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct sock *sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);

	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	if (unix_sk(sk)->path.dentry)
		touch_atime(&unix_sk(sk)->path);

	return sk;
}
1133
1134 static struct sock *unix_find_other(struct net *net,
1135                                     struct sockaddr_un *sunaddr,
1136                                     int addr_len, int type)
1137 {
1138         struct sock *sk;
1139
1140         if (sunaddr->sun_path[0])
1141                 sk = unix_find_bsd(sunaddr, addr_len, type);
1142         else
1143                 sk = unix_find_abstract(net, sunaddr, addr_len, type);
1144
1145         return sk;
1146 }
1147
1148 static int unix_autobind(struct sock *sk)
1149 {
1150         unsigned int new_hash, old_hash = sk->sk_hash;
1151         struct unix_sock *u = unix_sk(sk);
1152         struct net *net = sock_net(sk);
1153         struct unix_address *addr;
1154         u32 lastnum, ordernum;
1155         int err;
1156
1157         err = mutex_lock_interruptible(&u->bindlock);
1158         if (err)
1159                 return err;
1160
1161         if (u->addr)
1162                 goto out;
1163
1164         err = -ENOMEM;
1165         addr = kzalloc(sizeof(*addr) +
1166                        offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1167         if (!addr)
1168                 goto out;
1169
1170         addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1171         addr->name->sun_family = AF_UNIX;
1172         refcount_set(&addr->refcnt, 1);
1173
1174         ordernum = get_random_u32();
1175         lastnum = ordernum & 0xFFFFF;
1176 retry:
1177         ordernum = (ordernum + 1) & 0xFFFFF;
1178         sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1179
1180         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1181         unix_table_double_lock(net, old_hash, new_hash);
1182
1183         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1184                 unix_table_double_unlock(net, old_hash, new_hash);
1185
1186                 /* __unix_find_socket_byname() may take long time if many names
1187                  * are already in use.
1188                  */
1189                 cond_resched();
1190
1191                 if (ordernum == lastnum) {
1192                         /* Give up if all names seems to be in use. */
1193                         err = -ENOSPC;
1194                         unix_release_addr(addr);
1195                         goto out;
1196                 }
1197
1198                 goto retry;
1199         }
1200
1201         __unix_set_addr_hash(net, sk, addr, new_hash);
1202         unix_table_double_unlock(net, old_hash, new_hash);
1203         err = 0;
1204
1205 out:    mutex_unlock(&u->bindlock);
1206         return err;
1207 }
1208
1209 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1210                          int addr_len)
1211 {
1212         umode_t mode = S_IFSOCK |
1213                (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1214         unsigned int new_hash, old_hash = sk->sk_hash;
1215         struct unix_sock *u = unix_sk(sk);
1216         struct net *net = sock_net(sk);
1217         struct mnt_idmap *idmap;
1218         struct unix_address *addr;
1219         struct dentry *dentry;
1220         struct path parent;
1221         int err;
1222
1223         addr_len = unix_mkname_bsd(sunaddr, addr_len);
1224         addr = unix_create_addr(sunaddr, addr_len);
1225         if (!addr)
1226                 return -ENOMEM;
1227
1228         /*
1229          * Get the parent directory, calculate the hash for last
1230          * component.
1231          */
1232         dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1233         if (IS_ERR(dentry)) {
1234                 err = PTR_ERR(dentry);
1235                 goto out;
1236         }
1237
1238         /*
1239          * All right, let's create it.
1240          */
1241         idmap = mnt_idmap(parent.mnt);
1242         err = security_path_mknod(&parent, dentry, mode, 0);
1243         if (!err)
1244                 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1245         if (err)
1246                 goto out_path;
1247         err = mutex_lock_interruptible(&u->bindlock);
1248         if (err)
1249                 goto out_unlink;
1250         if (u->addr)
1251                 goto out_unlock;
1252
1253         new_hash = unix_bsd_hash(d_backing_inode(dentry));
1254         unix_table_double_lock(net, old_hash, new_hash);
1255         u->path.mnt = mntget(parent.mnt);
1256         u->path.dentry = dget(dentry);
1257         __unix_set_addr_hash(net, sk, addr, new_hash);
1258         unix_table_double_unlock(net, old_hash, new_hash);
1259         unix_insert_bsd_socket(sk);
1260         mutex_unlock(&u->bindlock);
1261         done_path_create(&parent, dentry);
1262         return 0;
1263
1264 out_unlock:
1265         mutex_unlock(&u->bindlock);
1266         err = -EINVAL;
1267 out_unlink:
1268         /* failed after successful mknod?  unlink what we'd created... */
1269         vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1270 out_path:
1271         done_path_create(&parent, dentry);
1272 out:
1273         unix_release_addr(addr);
1274         return err == -EEXIST ? -EADDRINUSE : err;
1275 }
1276
1277 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1278                               int addr_len)
1279 {
1280         unsigned int new_hash, old_hash = sk->sk_hash;
1281         struct unix_sock *u = unix_sk(sk);
1282         struct net *net = sock_net(sk);
1283         struct unix_address *addr;
1284         int err;
1285
1286         addr = unix_create_addr(sunaddr, addr_len);
1287         if (!addr)
1288                 return -ENOMEM;
1289
1290         err = mutex_lock_interruptible(&u->bindlock);
1291         if (err)
1292                 goto out;
1293
1294         if (u->addr) {
1295                 err = -EINVAL;
1296                 goto out_mutex;
1297         }
1298
1299         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1300         unix_table_double_lock(net, old_hash, new_hash);
1301
1302         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1303                 goto out_spin;
1304
1305         __unix_set_addr_hash(net, sk, addr, new_hash);
1306         unix_table_double_unlock(net, old_hash, new_hash);
1307         mutex_unlock(&u->bindlock);
1308         return 0;
1309
1310 out_spin:
1311         unix_table_double_unlock(net, old_hash, new_hash);
1312         err = -EADDRINUSE;
1313 out_mutex:
1314         mutex_unlock(&u->bindlock);
1315 out:
1316         unix_release_addr(addr);
1317         return err;
1318 }
1319
1320 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1321 {
1322         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1323         struct sock *sk = sock->sk;
1324         int err;
1325
1326         if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1327             sunaddr->sun_family == AF_UNIX)
1328                 return unix_autobind(sk);
1329
1330         err = unix_validate_addr(sunaddr, addr_len);
1331         if (err)
1332                 return err;
1333
1334         if (sunaddr->sun_path[0])
1335                 err = unix_bind_bsd(sk, sunaddr, addr_len);
1336         else
1337                 err = unix_bind_abstract(sk, sunaddr, addr_len);
1338
1339         return err;
1340 }
1341
/*
 * Take the state lock of @sk1 and, when @sk2 is a different non-NULL
 * socket, of @sk2 as well.  With two sockets the one at the lower address
 * is locked first and the other one with the nested variant.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	struct sock *first, *second;

	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}

	first = sk1 < sk2 ? sk1 : sk2;
	second = sk1 < sk2 ? sk2 : sk1;

	unix_state_lock(first);
	unix_state_lock_nested(second);
}
1356
/*
 * Counterpart of unix_state_double_lock(): drop the state lock of @sk1 and,
 * when @sk2 is a different non-NULL socket, of @sk2 as well.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	unix_state_unlock(sk1);

	if (sk2 && likely(sk1 != sk2))
		unix_state_unlock(sk2);
}
1366
1367 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1368                               int alen, int flags)
1369 {
1370         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1371         struct sock *sk = sock->sk;
1372         struct sock *other;
1373         int err;
1374
1375         err = -EINVAL;
1376         if (alen < offsetofend(struct sockaddr, sa_family))
1377                 goto out;
1378
1379         if (addr->sa_family != AF_UNSPEC) {
1380                 err = unix_validate_addr(sunaddr, alen);
1381                 if (err)
1382                         goto out;
1383
1384                 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1385                      test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1386                     !unix_sk(sk)->addr) {
1387                         err = unix_autobind(sk);
1388                         if (err)
1389                                 goto out;
1390                 }
1391
1392 restart:
1393                 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1394                 if (IS_ERR(other)) {
1395                         err = PTR_ERR(other);
1396                         goto out;
1397                 }
1398
1399                 unix_state_double_lock(sk, other);
1400
1401                 /* Apparently VFS overslept socket death. Retry. */
1402                 if (sock_flag(other, SOCK_DEAD)) {
1403                         unix_state_double_unlock(sk, other);
1404                         sock_put(other);
1405                         goto restart;
1406                 }
1407
1408                 err = -EPERM;
1409                 if (!unix_may_send(sk, other))
1410                         goto out_unlock;
1411
1412                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1413                 if (err)
1414                         goto out_unlock;
1415
1416                 sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1417         } else {
1418                 /*
1419                  *      1003.1g breaking connected state with AF_UNSPEC
1420                  */
1421                 other = NULL;
1422                 unix_state_double_lock(sk, other);
1423         }
1424
1425         /*
1426          * If it was connected, reconnect.
1427          */
1428         if (unix_peer(sk)) {
1429                 struct sock *old_peer = unix_peer(sk);
1430
1431                 unix_peer(sk) = other;
1432                 if (!other)
1433                         sk->sk_state = TCP_CLOSE;
1434                 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1435
1436                 unix_state_double_unlock(sk, other);
1437
1438                 if (other != old_peer)
1439                         unix_dgram_disconnected(sk, old_peer);
1440                 sock_put(old_peer);
1441         } else {
1442                 unix_peer(sk) = other;
1443                 unix_state_double_unlock(sk, other);
1444         }
1445
1446         return 0;
1447
1448 out_unlock:
1449         unix_state_double_unlock(sk, other);
1450         sock_put(other);
1451 out:
1452         return err;
1453 }
1454
1455 static long unix_wait_for_peer(struct sock *other, long timeo)
1456         __releases(&unix_sk(other)->lock)
1457 {
1458         struct unix_sock *u = unix_sk(other);
1459         int sched;
1460         DEFINE_WAIT(wait);
1461
1462         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1463
1464         sched = !sock_flag(other, SOCK_DEAD) &&
1465                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1466                 unix_recvq_full_lockless(other);
1467
1468         unix_state_unlock(other);
1469
1470         if (sched)
1471                 timeo = schedule_timeout(timeo);
1472
1473         finish_wait(&u->peer_wait, &wait);
1474         return timeo;
1475 }
1476
/*
 * ->connect() handler shared by the stream and seqpacket proto_ops.
 *
 * A new sock (@newsk) that will become the server-side end and a one-byte
 * skb are allocated up front.  The listener named by @uaddr is then looked
 * up and locked, @sk is locked nested inside it, the two ends are wired
 * together, and the skb (owned by @newsk) is queued on the listener for
 * unix_accept() to pick up.
 *
 * If the listener's queue is full the call either fails with -EAGAIN (no
 * timeout left) or sleeps in unix_wait_for_peer() and retries.
 *
 * Returns 0 on success, -EISCONN if @sk is already established, -EINVAL if
 * it is in any other state than TCP_CLOSE, -ECONNREFUSED if the target is
 * not listening or shut down for receive, -ENOMEM if the skb cannot be
 * allocated, or the error from address validation, autobind,
 * unix_create1(), peer lookup, an interrupted wait or the security hook.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	long timeo;
	int err;
	int st;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	/* u->addr is tested without u->bindlock; unix_autobind() rechecks
	 * it under the lock.
	 */
	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	 * If we did it after the state is locked,
	 * we would have to recheck everything again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		newsk = NULL;
		goto out;
	}

	err = -ENOMEM;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (!skb)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		other = NULL;
		goto out;
	}

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	/* unix_accept() dequeues from a listener without taking its state
	 * lock, so holding that lock here does not keep the queue length
	 * stable: use the lockless check.
	 */
	if (unix_recvq_full_lockless(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		/* Drops other's state lock. */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.
	 *
	 * This is a tricky place. We need to grab our state lock and cannot
	 * drop the lock on the peer. It is dangerous because deadlock is
	 * possible. The connect-to-self case and simultaneous connect
	 * attempts are eliminated by checking the socket state: other is
	 * TCP_LISTEN, and if sk is TCP_LISTEN we notice that before
	 * attempting to grab the lock.
	 *
	 * And we have to recheck the state once the socket is locked.
	 */
	st = READ_ONCE(sk->sk_state);	/* our lock is not held yet */

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Quickly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock.  Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same warranties
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	/* Paired with the lockless sk_state readers in this file. */
	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
1670
1671 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1672 {
1673         struct sock *ska = socka->sk, *skb = sockb->sk;
1674
1675         /* Join our sockets back to back */
1676         sock_hold(ska);
1677         sock_hold(skb);
1678         unix_peer(ska) = skb;
1679         unix_peer(skb) = ska;
1680         init_peercred(ska);
1681         init_peercred(skb);
1682
1683         ska->sk_state = TCP_ESTABLISHED;
1684         skb->sk_state = TCP_ESTABLISHED;
1685         socka->state  = SS_CONNECTED;
1686         sockb->state  = SS_CONNECTED;
1687         return 0;
1688 }
1689
1690 static void unix_sock_inherit_flags(const struct socket *old,
1691                                     struct socket *new)
1692 {
1693         if (test_bit(SOCK_PASSCRED, &old->flags))
1694                 set_bit(SOCK_PASSCRED, &new->flags);
1695         if (test_bit(SOCK_PASSPIDFD, &old->flags))
1696                 set_bit(SOCK_PASSPIDFD, &new->flags);
1697         if (test_bit(SOCK_PASSSEC, &old->flags))
1698                 set_bit(SOCK_PASSSEC, &new->flags);
1699 }
1700
1701 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1702                        bool kern)
1703 {
1704         struct sock *sk = sock->sk;
1705         struct sock *tsk;
1706         struct sk_buff *skb;
1707         int err;
1708
1709         err = -EOPNOTSUPP;
1710         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1711                 goto out;
1712
1713         err = -EINVAL;
1714         if (sk->sk_state != TCP_LISTEN)
1715                 goto out;
1716
1717         /* If socket state is TCP_LISTEN it cannot change (for now...),
1718          * so that no locks are necessary.
1719          */
1720
1721         skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1722                                 &err);
1723         if (!skb) {
1724                 /* This means receive shutdown. */
1725                 if (err == 0)
1726                         err = -EINVAL;
1727                 goto out;
1728         }
1729
1730         tsk = skb->sk;
1731         skb_free_datagram(sk, skb);
1732         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1733
1734         /* attach accepted sock to socket */
1735         unix_state_lock(tsk);
1736         newsock->state = SS_CONNECTED;
1737         unix_sock_inherit_flags(sock, newsock);
1738         sock_graft(tsk, newsock);
1739         unix_state_unlock(tsk);
1740         return 0;
1741
1742 out:
1743         return err;
1744 }
1745
1746
1747 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1748 {
1749         struct sock *sk = sock->sk;
1750         struct unix_address *addr;
1751         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1752         int err = 0;
1753
1754         if (peer) {
1755                 sk = unix_peer_get(sk);
1756
1757                 err = -ENOTCONN;
1758                 if (!sk)
1759                         goto out;
1760                 err = 0;
1761         } else {
1762                 sock_hold(sk);
1763         }
1764
1765         addr = smp_load_acquire(&unix_sk(sk)->addr);
1766         if (!addr) {
1767                 sunaddr->sun_family = AF_UNIX;
1768                 sunaddr->sun_path[0] = 0;
1769                 err = offsetof(struct sockaddr_un, sun_path);
1770         } else {
1771                 err = addr->len;
1772                 memcpy(sunaddr, addr->name, addr->len);
1773         }
1774         sock_put(sk);
1775 out:
1776         return err;
1777 }
1778
/*
 * MSG_PEEK helper: store a duplicate (scm_fp_dup()) of the fd list attached
 * to @skb in @scm->fp, then take and immediately release unix_gc_lock to
 * serialize with the garbage collector as explained below.
 */
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
        scm->fp = scm_fp_dup(UNIXCB(skb).fp);

        /*
         * Garbage collection of unix sockets starts by selecting a set of
         * candidate sockets which have reference only from being in flight
         * (total_refs == inflight_refs).  This condition is checked once during
         * the candidate collection phase, and candidates are marked as such, so
         * that non-candidates can later be ignored.  While inflight_refs is
         * protected by unix_gc_lock, total_refs (file count) is not, hence this
         * is an instantaneous decision.
         *
         * Once a candidate, however, the socket must not be reinstalled into a
         * file descriptor while the garbage collection is in progress.
         *
         * If the above conditions are met, then the directed graph of
         * candidates (*) does not change while unix_gc_lock is held.
         *
         * Any operation that changes the file count through file descriptors
         * (dup, close, sendmsg) does not change the graph since candidates are
         * not installed in fds.
         *
         * Dequeuing a candidate via recvmsg would install it into an fd, but
         * that takes unix_gc_lock to decrement the inflight count, so it's
         * serialized with garbage collection.
         *
         * MSG_PEEK is special in that it does not change the inflight count,
         * yet does install the socket into an fd.  The following lock/unlock
         * pair is to ensure serialization with garbage collection.  It must be
         * done between incrementing the file count and installing the file into
         * an fd.
         *
         * If garbage collection starts after the barrier provided by the
         * lock/unlock, then it will see the elevated refcount and not mark this
         * as a candidate.  If a garbage collection is already in progress
         * before the file count was incremented, then the lock/unlock pair will
         * ensure that garbage collection is finished before progressing to
         * installing the fd.
         *
         * (*) A -> B where B is on the queue of A or B is on the queue of C
         * which is on the queue of listening socket A.
         */
        spin_lock(&unix_gc_lock);
        spin_unlock(&unix_gc_lock);
}
1825
1826 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1827 {
1828         int err = 0;
1829
1830         UNIXCB(skb).pid  = get_pid(scm->pid);
1831         UNIXCB(skb).uid = scm->creds.uid;
1832         UNIXCB(skb).gid = scm->creds.gid;
1833         UNIXCB(skb).fp = NULL;
1834         unix_get_secdata(scm, skb);
1835         if (scm->fp && send_fds)
1836                 err = unix_attach_fds(scm, skb);
1837
1838         skb->destructor = unix_destruct_scm;
1839         return err;
1840 }
1841
1842 static bool unix_passcred_enabled(const struct socket *sock,
1843                                   const struct sock *other)
1844 {
1845         return test_bit(SOCK_PASSCRED, &sock->flags) ||
1846                test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1847                !other->sk_socket ||
1848                test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1849                test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1850 }
1851
1852 /*
1853  * Some apps rely on write() giving SCM_CREDENTIALS
1854  * We include credentials if source or destination socket
1855  * asserted SOCK_PASSCRED.
1856  */
1857 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1858                             const struct sock *other)
1859 {
1860         if (UNIXCB(skb).pid)
1861                 return;
1862         if (unix_passcred_enabled(sock, other)) {
1863                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1864                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1865         }
1866 }
1867
1868 static bool unix_skb_scm_eq(struct sk_buff *skb,
1869                             struct scm_cookie *scm)
1870 {
1871         return UNIXCB(skb).pid == scm->pid &&
1872                uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1873                gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1874                unix_secdata_eq(scm, skb);
1875 }
1876
1877 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1878 {
1879         struct scm_fp_list *fp = UNIXCB(skb).fp;
1880         struct unix_sock *u = unix_sk(sk);
1881
1882         if (unlikely(fp && fp->count))
1883                 atomic_add(fp->count, &u->scm_stat.nr_fds);
1884 }
1885
1886 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1887 {
1888         struct scm_fp_list *fp = UNIXCB(skb).fp;
1889         struct unix_sock *u = unix_sk(sk);
1890
1891         if (unlikely(fp && fp->count))
1892                 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1893 }
1894
1895 /*
1896  *      Send AF_UNIX data.
1897  */
1898
/*
 * sendmsg() handler for datagram sockets; unix_seqpacket_sendmsg() also
 * forwards here.
 *
 * @sock: sending socket
 * @msg:  message header; msg_name/msg_namelen optionally carry the
 *        destination address, msg_iter supplies the payload
 * @len:  payload length in bytes
 *
 * The whole payload is copied into a single skb which is appended to the
 * receiver's sk_receive_queue.
 *
 * Returns @len on success, and also when the receiver's socket filter
 * rejects the skb (the packet is dropped silently).  Otherwise returns a
 * negative errno.
 */
static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
                              size_t len)
{
        DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
        struct sock *sk = sock->sk, *other = NULL;
        struct unix_sock *u = unix_sk(sk);
        struct scm_cookie scm;
        struct sk_buff *skb;
        int data_len = 0;
        int sk_locked;
        long timeo;
        int err;

        wait_for_unix_gc();
        err = scm_send(sock, msg, &scm, false);
        if (err < 0)
                return err;

        /* From here on every exit must go through scm_destroy(&scm). */
        err = -EOPNOTSUPP;
        if (msg->msg_flags&MSG_OOB)
                goto out;

        if (msg->msg_namelen) {
                /* Explicit destination: only validated here, the lookup
                 * happens at the restart label.
                 */
                err = unix_validate_addr(sunaddr, msg->msg_namelen);
                if (err)
                        goto out;
        } else {
                /* No address given: send to the connected peer, if any. */
                sunaddr = NULL;
                err = -ENOTCONN;
                other = unix_peer_get(sk);
                if (!other)
                        goto out;
        }

        /* A sender that enabled credential passing but has no address yet
         * is autobound first.
         */
        if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
             test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
                err = unix_autobind(sk);
                if (err)
                        goto out;
        }

        err = -EMSGSIZE;
        if (len > sk->sk_sndbuf - 32)
                goto out;

        /* Anything beyond SKB_MAX_ALLOC is requested as paged data, capped
         * at MAX_SKB_FRAGS pages and rounded up to whole pages.
         */
        if (len > SKB_MAX_ALLOC) {
                data_len = min_t(size_t,
                                 len - SKB_MAX_ALLOC,
                                 MAX_SKB_FRAGS * PAGE_SIZE);
                data_len = PAGE_ALIGN(data_len);

                BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
        }

        skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
                                   msg->msg_flags & MSG_DONTWAIT, &err,
                                   PAGE_ALLOC_COSTLY_ORDER);
        if (skb == NULL)
                goto out;

        err = unix_scm_to_skb(&scm, skb, true);
        if (err < 0)
                goto out_free;

        skb_put(skb, len - data_len);
        skb->data_len = data_len;
        skb->len = len;
        err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
        if (err)
                goto out_free;

        timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
        if (!other) {
                /* No receiver in hand.  Without an address there is nothing
                 * to look up (the peer we had is gone), otherwise resolve
                 * the destination address.
                 */
                err = -ECONNRESET;
                if (sunaddr == NULL)
                        goto out_free;

                other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
                                        sk->sk_type);
                if (IS_ERR(other)) {
                        err = PTR_ERR(other);
                        other = NULL;
                        goto out_free;
                }
        }

        if (sk_filter(other, skb) < 0) {
                /* Toss the packet but do not return any error to the sender */
                err = len;
                goto out_free;
        }

        /* sk_locked records whether sk's state lock is held in addition to
         * other's; the exit paths consult it.
         */
        sk_locked = 0;
        unix_state_lock(other);
restart_locked:
        err = -EPERM;
        if (!unix_may_send(sk, other))
                goto out_unlock;

        if (unlikely(sock_flag(other, SOCK_DEAD))) {
                /*
                 *      Check with 1003.1g - what should
                 *      datagram error
                 */
                unix_state_unlock(other);
                sock_put(other);

                /* From here only sk's state lock is held; every branch
                 * below releases it.
                 */
                if (!sk_locked)
                        unix_state_lock(sk);

                err = 0;
                if (sk->sk_type == SOCK_SEQPACKET) {
                        /* We are here only when racing with unix_release_sock()
                         * is clearing @other. Never change state to TCP_CLOSE
                         * unlike SOCK_DGRAM wants.
                         */
                        unix_state_unlock(sk);
                        err = -EPIPE;
                } else if (unix_peer(sk) == other) {
                        /* The dead sock was our connected peer: disconnect
                         * from it.  The second sock_put() presumably drops
                         * the reference held via unix_peer(sk) — confirm.
                         */
                        unix_peer(sk) = NULL;
                        unix_dgram_peer_wake_disconnect_wakeup(sk, other);

                        sk->sk_state = TCP_CLOSE;
                        unix_state_unlock(sk);

                        unix_dgram_disconnected(sk, other);
                        sock_put(other);
                        err = -ECONNREFUSED;
                } else {
                        unix_state_unlock(sk);
                }

                /* err is still 0 only in the last branch above; in that
                 * case retry, which looks the address up again.
                 */
                other = NULL;
                if (err)
                        goto out_free;
                goto restart;
        }

        err = -EPIPE;
        if (other->sk_shutdown & RCV_SHUTDOWN)
                goto out_unlock;

        if (sk->sk_type != SOCK_SEQPACKET) {
                err = security_unix_may_send(sk->sk_socket, other->sk_socket);
                if (err)
                        goto out_unlock;
        }

        /* other == sk && unix_peer(other) != sk if
         * - unix_peer(sk) == NULL, destination address bound to sk
         * - unix_peer(sk) == sk by time of get but disconnected before lock
         */
        if (other != sk &&
            unlikely(unix_peer(other) != sk &&
            unix_recvq_full_lockless(other))) {
                if (timeo) {
                        /* NOTE(review): neither exit below unlocks @other,
                         * so unix_wait_for_peer() is presumably expected to
                         * return with other's state lock released — confirm.
                         */
                        timeo = unix_wait_for_peer(other, timeo);

                        err = sock_intr_errno(timeo);
                        if (signal_pending(current))
                                goto out_free;

                        goto restart;
                }

                /* Non-blocking (or timed out): take both state locks, if
                 * that has not been done on a previous pass.
                 */
                if (!sk_locked) {
                        unix_state_unlock(other);
                        unix_state_double_lock(sk, other);
                }

                if (unix_peer(sk) != other ||
                    unix_dgram_peer_wake_me(sk, other)) {
                        err = -EAGAIN;
                        sk_locked = 1;
                        goto out_unlock;
                }

                /* Both locks were only just taken: redo the checks made
                 * under other's lock before queueing.
                 */
                if (!sk_locked) {
                        sk_locked = 1;
                        goto restart_locked;
                }
        }

        if (unlikely(sk_locked))
                unix_state_unlock(sk);

        /* Deliver: queue the skb while other's state lock is still held,
         * then notify the receiver after dropping it.
         */
        if (sock_flag(other, SOCK_RCVTSTAMP))
                __net_timestamp(skb);
        maybe_add_creds(skb, sock, other);
        scm_stat_add(other, skb);
        skb_queue_tail(&other->sk_receive_queue, skb);
        unix_state_unlock(other);
        other->sk_data_ready(other);
        sock_put(other);
        scm_destroy(&scm);
        return len;

        /* Error exits, from most to least state to undo. */
out_unlock:
        if (sk_locked)
                unix_state_unlock(sk);
        unix_state_unlock(other);
out_free:
        kfree_skb(skb);
out:
        if (other)
                sock_put(other);
        scm_destroy(&scm);
        return err;
}
2110
/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 *
 * The value is PAGE_SIZE shifted left by get_order(32768); it is used by
 * unix_stream_sendmsg() both to cap the size of one skb and to derive
 * the allocation order passed to sock_alloc_send_pskb().
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2115
2116 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * Queue one out-of-band byte from @msg on @other.
 *
 * @sock:     sending socket
 * @msg:      message whose iterator supplies the byte
 * @other:    receiving sock
 * @scm:      control data to stamp on the skb
 * @fds_sent: true if the fds in @scm already went out with an earlier skb
 *            of this send; they are then not attached again
 *
 * A one-byte skb is allocated, filled and appended to other's receive
 * queue; it also becomes other's oob_skb.
 *
 * Returns 0 on success, -EPIPE if @other is dead or shut down for
 * receiving, or the error from allocation / scm setup / the copy.
 */
static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
                     struct scm_cookie *scm, bool fds_sent)
{
        struct unix_sock *ousk = unix_sk(other);
        struct sk_buff *skb;
        int err = 0;

        skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);

        if (!skb)
                return err;

        err = unix_scm_to_skb(scm, skb, !fds_sent);
        if (err < 0) {
                kfree_skb(skb);
                return err;
        }
        skb_put(skb, 1);
        err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);

        if (err) {
                kfree_skb(skb);
                return err;
        }

        unix_state_lock(other);

        if (sock_flag(other, SOCK_DEAD) ||
            (other->sk_shutdown & RCV_SHUTDOWN)) {
                unix_state_unlock(other);
                kfree_skb(skb);
                return -EPIPE;
        }

        maybe_add_creds(skb, sock, other);
        /* Extra reference: the skb is reachable both from the receive queue
         * and through ousk->oob_skb.
         */
        skb_get(skb);

        /* Release the reference on a previously recorded OOB skb, if any. */
        if (ousk->oob_skb)
                consume_skb(ousk->oob_skb);

        WRITE_ONCE(ousk->oob_skb, skb);

        scm_stat_add(other, skb);
        skb_queue_tail(&other->sk_receive_queue, skb);
        sk_send_sigurg(other);
        unix_state_unlock(other);
        other->sk_data_ready(other);

        /* err is 0 here: the last assignment was the successful copy. */
        return err;
}
2167 #endif
2168
/*
 * sendmsg() handler for stream sockets.
 *
 * @sock: sending socket
 * @msg:  message header; a destination address is not accepted
 * @len:  number of bytes to send
 *
 * The payload is cut into skbs which are appended one by one to the
 * peer's sk_receive_queue.  With MSG_OOB (and CONFIG_AF_UNIX_OOB) the
 * last byte is held back and sent through queue_oob() afterwards.
 *
 * Returns the number of bytes queued if that is non-zero, even when a
 * later chunk failed; otherwise a negative errno.  SIGPIPE is raised on
 * the -EPIPE paths when nothing was sent and MSG_NOSIGNAL is not set.
 */
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
                               size_t len)
{
        struct sock *sk = sock->sk;
        struct sock *other = NULL;
        int err, size;
        struct sk_buff *skb;
        int sent = 0;
        struct scm_cookie scm;
        bool fds_sent = false;
        int data_len;

        wait_for_unix_gc();
        err = scm_send(sock, msg, &scm, false);
        if (err < 0)
                return err;

        /* Without OOB support MSG_OOB is always refused; with it, an empty
         * MSG_OOB send is refused and otherwise one byte is reserved for
         * queue_oob() below.
         */
        err = -EOPNOTSUPP;
        if (msg->msg_flags & MSG_OOB) {
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
                if (len)
                        len--;
                else
#endif
                        goto out_err;
        }

        if (msg->msg_namelen) {
                err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
                goto out_err;
        } else {
                /* Note: the peer pointer is read without taking a reference. */
                err = -ENOTCONN;
                other = unix_peer(sk);
                if (!other)
                        goto out_err;
        }

        if (sk->sk_shutdown & SEND_SHUTDOWN)
                goto pipe_err;

        while (sent < len) {
                size = len - sent;

                if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
                        /* Empty skb; it is filled by skb_splice_from_iter()
                         * below.
                         */
                        skb = sock_alloc_send_pskb(sk, 0, 0,
                                                   msg->msg_flags & MSG_DONTWAIT,
                                                   &err, 0);
                } else {
                        /* Keep two messages in the pipe so it schedules better */
                        size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

                        /* allow fallback to order-0 allocations */
                        size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

                        /* Whatever does not fit the linear head goes into
                         * paged data, page aligned but never more than size.
                         */
                        data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

                        data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

                        skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
                                                   msg->msg_flags & MSG_DONTWAIT, &err,
                                                   get_order(UNIX_SKB_FRAGS_SZ));
                }
                if (!skb)
                        goto out_err;

                /* Only send the fds in the first buffer */
                err = unix_scm_to_skb(&scm, skb, !fds_sent);
                if (err < 0) {
                        kfree_skb(skb);
                        goto out_err;
                }
                fds_sent = true;

                if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
                        err = skb_splice_from_iter(skb, &msg->msg_iter, size,
                                                   sk->sk_allocation);
                        if (err < 0) {
                                kfree_skb(skb);
                                goto out_err;
                        }
                        /* The non-negative return value is used as the
                         * chunk size and added to sk_wmem_alloc.
                         */
                        size = err;
                        refcount_add(size, &sk->sk_wmem_alloc);
                } else {
                        skb_put(skb, size - data_len);
                        skb->data_len = data_len;
                        skb->len = size;
                        err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
                        if (err) {
                                kfree_skb(skb);
                                goto out_err;
                        }
                }

                /* Re-check the peer under its state lock for every chunk. */
                unix_state_lock(other);

                if (sock_flag(other, SOCK_DEAD) ||
                    (other->sk_shutdown & RCV_SHUTDOWN))
                        goto pipe_err_free;

                maybe_add_creds(skb, sock, other);
                scm_stat_add(other, skb);
                skb_queue_tail(&other->sk_receive_queue, skb);
                unix_state_unlock(other);
                other->sk_data_ready(other);
                sent += size;
        }

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
        if (msg->msg_flags & MSG_OOB) {
                /* Send the byte held back above as out-of-band data. */
                err = queue_oob(sock, msg, other, &scm, fds_sent);
                if (err)
                        goto out_err;
                sent++;
        }
#endif

        scm_destroy(&scm);

        return sent;

pipe_err_free:
        unix_state_unlock(other);
        kfree_skb(skb);
pipe_err:
        if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
                send_sig(SIGPIPE, current, 0);
        err = -EPIPE;
out_err:
        scm_destroy(&scm);
        /* A partial send reports the byte count rather than the error. */
        return sent ? : err;
}
2300
2301 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2302                                   size_t len)
2303 {
2304         int err;
2305         struct sock *sk = sock->sk;
2306
2307         err = sock_error(sk);
2308         if (err)
2309                 return err;
2310
2311         if (sk->sk_state != TCP_ESTABLISHED)
2312                 return -ENOTCONN;
2313
2314         if (msg->msg_namelen)
2315                 msg->msg_namelen = 0;
2316
2317         return unix_dgram_sendmsg(sock, msg, len);
2318 }
2319
2320 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2321                                   size_t size, int flags)
2322 {
2323         struct sock *sk = sock->sk;
2324
2325         if (sk->sk_state != TCP_ESTABLISHED)
2326                 return -ENOTCONN;
2327
2328         return unix_dgram_recvmsg(sock, msg, size, flags);
2329 }
2330
2331 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2332 {
2333         struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2334
2335         if (addr) {
2336                 msg->msg_namelen = addr->len;
2337                 memcpy(msg->msg_name, addr->name, addr->len);
2338         }
2339 }
2340
/*
 * Receive one datagram from @sk into @msg.
 *
 * @sk:    receiving sock
 * @msg:   destination; also receives the sender address and control data
 * @size:  capacity of the caller's buffer in bytes
 * @flags: MSG_* receive flags (MSG_OOB is refused with -EOPNOTSUPP)
 *
 * Returns the number of bytes copied, or with MSG_TRUNC in @flags the
 * remaining length of the datagram from the peek offset; a negative
 * errno on failure; 0 for a SOCK_SEQPACKET socket that would have
 * returned -EAGAIN but is shut down for receiving.
 */
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
                         int flags)
{
        struct scm_cookie scm;
        struct socket *sock = sk->sk_socket;
        struct unix_sock *u = unix_sk(sk);
        struct sk_buff *skb, *last;
        long timeo;
        int skip;
        int err;

        err = -EOPNOTSUPP;
        if (flags&MSG_OOB)
                goto out;

        timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

        /* Try to dequeue under iolock.  On success the loop is left with
         * iolock still held; on failure iolock is dropped before waiting
         * for more packets (only when the error is -EAGAIN and there is
         * time left).
         */
        do {
                mutex_lock(&u->iolock);

                skip = sk_peek_offset(sk, flags);
                skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
                                              &skip, &err, &last);
                if (skb) {
                        if (!(flags & MSG_PEEK))
                                scm_stat_del(sk, skb);
                        break;
                }

                mutex_unlock(&u->iolock);

                if (err != -EAGAIN)
                        break;
        } while (timeo &&
                 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
                                              &err, &timeo, last));

        if (!skb) { /* implies iolock unlocked */
                unix_state_lock(sk);
                /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
                if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
                    (sk->sk_shutdown & RCV_SHUTDOWN))
                        err = 0;
                unix_state_unlock(sk);
                goto out;
        }

        /* Wake anybody sleeping on our peer_wait queue for write space. */
        if (wq_has_sleeper(&u->peer_wait))
                wake_up_interruptible_sync_poll(&u->peer_wait,
                                                EPOLLOUT | EPOLLWRNORM |
                                                EPOLLWRBAND);

        /* Report the sender's address (skb->sk is the sending sock). */
        if (msg->msg_name)
                unix_copy_addr(msg, skb->sk);

        /* Clamp to what is left of the datagram after the peek offset;
         * flag truncation if the caller's buffer is too small.
         */
        if (size > skb->len - skip)
                size = skb->len - skip;
        else if (size < skb->len - skip)
                msg->msg_flags |= MSG_TRUNC;

        err = skb_copy_datagram_msg(skb, skip, msg, size);
        if (err)
                goto out_free;

        if (sock_flag(sk, SOCK_RCVTSTAMP))
                __sock_recv_timestamp(msg, sk, skb);

        memset(&scm, 0, sizeof(scm));

        scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
        unix_set_secdata(&scm, skb);

        if (!(flags & MSG_PEEK)) {
                /* Real read: move the fds from the skb into scm. */
                if (UNIXCB(skb).fp)
                        unix_detach_fds(&scm, skb);

                sk_peek_offset_bwd(sk, skb->len);
        } else {
                /* It is questionable: on PEEK we could:
                   - do not return fds - good, but too simple 8)
                   - return fds, and do not return them on read (old strategy,
                     apparently wrong)
                   - clone fds (I chose it for now, it is the most universal
                     solution)

                   POSIX 1003.1g does not actually define this clearly
                   at all. POSIX 1003.1g doesn't define a lot of things
                   clearly however!

                */

                sk_peek_offset_fwd(sk, size);

                if (UNIXCB(skb).fp)
                        unix_peek_fds(&scm, skb);
        }
        err = (flags & MSG_TRUNC) ? skb->len - skip : size;

        scm_recv_unix(sock, msg, &scm, flags);

        /* Reached on success as well as on copy failure; iolock is held. */
out_free:
        skb_free_datagram(sk, skb);
        mutex_unlock(&u->iolock);
out:
        return err;
}
2447
2448 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2449                               int flags)
2450 {
2451         struct sock *sk = sock->sk;
2452
2453 #ifdef CONFIG_BPF_SYSCALL
2454         const struct proto *prot = READ_ONCE(sk->sk_prot);
2455
2456         if (prot != &unix_dgram_proto)
2457                 return prot->recvmsg(sk, msg, size, flags, NULL);
2458 #endif
2459         return __unix_dgram_recvmsg(sk, msg, size, flags);
2460 }
2461
2462 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2463 {
2464         struct unix_sock *u = unix_sk(sk);
2465         struct sk_buff *skb;
2466         int err;
2467
2468         mutex_lock(&u->iolock);
2469         skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2470         mutex_unlock(&u->iolock);
2471         if (!skb)
2472                 return err;
2473
2474         return recv_actor(sk, skb);
2475 }
2476
/*
 *      Sleep until more data has arrived. But check for races..
 *
 *      @sk:        sock whose receive queue is watched
 *      @timeo:     maximum time to sleep
 *      @last:      tail of the receive queue as last seen by the caller
 *      @last_len:  length of @last as last seen by the caller
 *      @freezable: sleep in TASK_FREEZABLE state in addition to
 *                  TASK_INTERRUPTIBLE
 *
 *      Returns the remaining timeout.
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
                                  struct sk_buff *last, unsigned int last_len,
                                  bool freezable)
{
        unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
        struct sk_buff *tail;
        DEFINE_WAIT(wait);

        unix_state_lock(sk);

        for (;;) {
                /* Register on the wait queue before testing the conditions,
                 * all of which are checked under the state lock.
                 */
                prepare_to_wait(sk_sleep(sk), &wait, state);

                /* Stop waiting when the queue tail changed or grew, an
                 * error or receive shutdown is pending, a signal arrived,
                 * or the timeout is used up.
                 */
                tail = skb_peek_tail(&sk->sk_receive_queue);
                if (tail != last ||
                    (tail && tail->len != last_len) ||
                    sk->sk_err ||
                    (sk->sk_shutdown & RCV_SHUTDOWN) ||
                    signal_pending(current) ||
                    !timeo)
                        break;

                /* The state lock is dropped only for the sleep itself. */
                sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
                unix_state_unlock(sk);
                timeo = schedule_timeout(timeo);
                unix_state_lock(sk);

                /* Note: on a dead sock the loop is left without clearing
                 * SOCKWQ_ASYNC_WAITDATA.
                 */
                if (sock_flag(sk, SOCK_DEAD))
                        break;

                sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
        }

        finish_wait(sk_sleep(sk), &wait);
        unix_state_unlock(sk);
        return timeo;
}
2517
2518 static unsigned int unix_skb_len(const struct sk_buff *skb)
2519 {
2520         return skb->len - UNIXCB(skb).consumed;
2521 }
2522
/*
 * Parameters of one stream read, passed to the generic read path and on
 * to the recv_actor callback.
 */
struct unix_stream_read_state {
        /* Called as recv_actor(skb, offset, chunk, state); a negative
         * return value is treated as failure by unix_stream_recv_urg().
         */
        int (*recv_actor)(struct sk_buff *, int, int,
                          struct unix_stream_read_state *);
        struct socket *socket;          /* socket being read from */
        struct msghdr *msg;             /* destination message; msg_flags is updated */
        struct pipe_inode_info *pipe;   /* presumably the splice target — TODO confirm */
        size_t size;                    /* number of bytes requested */
        int flags;                      /* MSG_* receive flags */
        unsigned int splice_flags;      /* presumably SPLICE_F_* flags — TODO confirm */
};
2533
2534 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * Handle a MSG_OOB read: deliver the byte held in the sock's oob_skb
 * through state->recv_actor.
 *
 * Returns 1 on success (MSG_OOB is then set in msg_flags), -EINVAL if
 * SOCK_URGINLINE is set or there is no OOB skb, and -EFAULT if the actor
 * reported a negative result.
 */
static int unix_stream_recv_urg(struct unix_stream_read_state *state)
{
        struct socket *sock = state->socket;
        struct sock *sk = sock->sk;
        struct unix_sock *u = unix_sk(sk);
        int chunk = 1;
        struct sk_buff *oob_skb;

        mutex_lock(&u->iolock);
        unix_state_lock(sk);

        if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
                unix_state_unlock(sk);
                mutex_unlock(&u->iolock);
                return -EINVAL;
        }

        oob_skb = u->oob_skb;

        /* Either way this function ends up owning one reference, which is
         * released by consume_skb() below: a real read takes over the one
         * that u->oob_skb held, a peek acquires an additional one.
         */
        if (!(state->flags & MSG_PEEK))
                WRITE_ONCE(u->oob_skb, NULL);
        else
                skb_get(oob_skb);
        unix_state_unlock(sk);

        /* The actor runs with iolock held but the state lock dropped. */
        chunk = state->recv_actor(oob_skb, 0, chunk, state);

        /* Note: the byte is marked consumed even if the actor failed. */
        if (!(state->flags & MSG_PEEK))
                UNIXCB(oob_skb).consumed += 1;

        consume_skb(oob_skb);

        mutex_unlock(&u->iolock);

        if (chunk < 0)
                return -EFAULT;

        state->msg->msg_flags |= MSG_OOB;
        return 1;
}
2575
/*
 * Decide what to do with @skb, taken from the receive queue of @sk,
 * with respect to out-of-band data.
 *
 * @skb:    skb under consideration
 * @sk:     receiving sock
 * @flags:  MSG_* receive flags
 * @copied: number of bytes already copied (only tested for non-zero)
 *
 * Returns the skb to continue with, which may be @skb itself, the new
 * head of the receive queue, or NULL.
 */
static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
                                  int flags, int copied)
{
        struct unix_sock *u = unix_sk(sk);

        if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
                /* Nothing left to read in this skb and not peeking: remove
                 * it from the queue and drop a reference.
                 */
                skb_unlink(skb, &sk->sk_receive_queue);
                consume_skb(skb);
                skb = NULL;
        } else {
                if (skb == u->oob_skb) {
                        if (copied) {
                                /* Data was already copied: hand back
                                 * nothing instead of the OOB skb.
                                 */
                                skb = NULL;
                        } else if (sock_flag(sk, SOCK_URGINLINE)) {
                                /* Inline mode: the skb is returned to the
                                 * caller; unless peeking it stops being the
                                 * OOB skb and one reference is dropped.
                                 */
                                if (!(flags & MSG_PEEK)) {
                                        WRITE_ONCE(u->oob_skb, NULL);
                                        consume_skb(skb);
                                }
                        } else if (!(flags & MSG_PEEK)) {
                                /* Not inline, not peeking: take the skb off
                                 * the queue, drop one reference and go on
                                 * with the new queue head.  u->oob_skb is
                                 * left untouched in this branch.
                                 */
                                skb_unlink(skb, &sk->sk_receive_queue);
                                consume_skb(skb);
                                skb = skb_peek(&sk->sk_receive_queue);
                        }
                }
        }
        return skb;
}
2603 #endif
2604
2605 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2606 {
2607         if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2608                 return -ENOTCONN;
2609
2610         return unix_read_skb(sk, recv_actor);
2611 }
2612
2613 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2614                                     bool freezable)
2615 {
2616         struct scm_cookie scm;
2617         struct socket *sock = state->socket;
2618         struct sock *sk = sock->sk;
2619         struct unix_sock *u = unix_sk(sk);
2620         int copied = 0;
2621         int flags = state->flags;
2622         int noblock = flags & MSG_DONTWAIT;
2623         bool check_creds = false;
2624         int target;
2625         int err = 0;
2626         long timeo;
2627         int skip;
2628         size_t size = state->size;
2629         unsigned int last_len;
2630
2631         if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2632                 err = -EINVAL;
2633                 goto out;
2634         }
2635
2636         if (unlikely(flags & MSG_OOB)) {
2637                 err = -EOPNOTSUPP;
2638 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2639                 err = unix_stream_recv_urg(state);
2640 #endif
2641                 goto out;
2642         }
2643
2644         target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2645         timeo = sock_rcvtimeo(sk, noblock);
2646
2647         memset(&scm, 0, sizeof(scm));
2648
2649         /* Lock the socket to prevent queue disordering
2650          * while sleeps in memcpy_tomsg
2651          */
2652         mutex_lock(&u->iolock);
2653
2654         skip = max(sk_peek_offset(sk, flags), 0);
2655
2656         do {
2657                 int chunk;
2658                 bool drop_skb;
2659                 struct sk_buff *skb, *last;
2660
2661 redo:
2662                 unix_state_lock(sk);
2663                 if (sock_flag(sk, SOCK_DEAD)) {
2664                         err = -ECONNRESET;
2665                         goto unlock;
2666                 }
2667                 last = skb = skb_peek(&sk->sk_receive_queue);
2668                 last_len = last ? last->len : 0;
2669
2670 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2671                 if (skb) {
2672                         skb = manage_oob(skb, sk, flags, copied);
2673                         if (!skb) {
2674                                 unix_state_unlock(sk);
2675                                 if (copied)
2676                                         break;
2677                                 goto redo;
2678                         }
2679                 }
2680 #endif
2681 again:
2682                 if (skb == NULL) {
2683                         if (copied >= target)
2684                                 goto unlock;
2685
2686                         /*
2687                          *      POSIX 1003.1g mandates this order.
2688                          */
2689
2690                         err = sock_error(sk);
2691                         if (err)
2692                                 goto unlock;
2693                         if (sk->sk_shutdown & RCV_SHUTDOWN)
2694                                 goto unlock;
2695
2696                         unix_state_unlock(sk);
2697                         if (!timeo) {
2698                                 err = -EAGAIN;
2699                                 break;
2700                         }
2701
2702                         mutex_unlock(&u->iolock);
2703
2704                         timeo = unix_stream_data_wait(sk, timeo, last,
2705                                                       last_len, freezable);
2706
2707                         if (signal_pending(current)) {
2708                                 err = sock_intr_errno(timeo);
2709                                 scm_destroy(&scm);
2710                                 goto out;
2711                         }
2712
2713                         mutex_lock(&u->iolock);
2714                         goto redo;
2715 unlock:
2716                         unix_state_unlock(sk);
2717                         break;
2718                 }
2719
2720                 while (skip >= unix_skb_len(skb)) {
2721                         skip -= unix_skb_len(skb);
2722                         last = skb;
2723                         last_len = skb->len;
2724                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2725                         if (!skb)
2726                                 goto again;
2727                 }
2728
2729                 unix_state_unlock(sk);
2730
2731                 if (check_creds) {
2732                         /* Never glue messages from different writers */
2733                         if (!unix_skb_scm_eq(skb, &scm))
2734                                 break;
2735                 } else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2736                            test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2737                         /* Copy credentials */
2738                         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2739                         unix_set_secdata(&scm, skb);
2740                         check_creds = true;
2741                 }
2742
2743                 /* Copy address just once */
2744                 if (state->msg && state->msg->msg_name) {
2745                         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2746                                          state->msg->msg_name);
2747                         unix_copy_addr(state->msg, skb->sk);
2748                         sunaddr = NULL;
2749                 }
2750
2751                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2752                 skb_get(skb);
2753                 chunk = state->recv_actor(skb, skip, chunk, state);
2754                 drop_skb = !unix_skb_len(skb);
2755                 /* skb is only safe to use if !drop_skb */
2756                 consume_skb(skb);
2757                 if (chunk < 0) {
2758                         if (copied == 0)
2759                                 copied = -EFAULT;
2760                         break;
2761                 }
2762                 copied += chunk;
2763                 size -= chunk;
2764
2765                 if (drop_skb) {
2766                         /* the skb was touched by a concurrent reader;
2767                          * we should not expect anything from this skb
2768                          * anymore and assume it invalid - we can be
2769                          * sure it was dropped from the socket queue
2770                          *
2771                          * let's report a short read
2772                          */
2773                         err = 0;
2774                         break;
2775                 }
2776
2777                 /* Mark read part of skb as used */
2778                 if (!(flags & MSG_PEEK)) {
2779                         UNIXCB(skb).consumed += chunk;
2780
2781                         sk_peek_offset_bwd(sk, chunk);
2782
2783                         if (UNIXCB(skb).fp) {
2784                                 scm_stat_del(sk, skb);
2785                                 unix_detach_fds(&scm, skb);
2786                         }
2787
2788                         if (unix_skb_len(skb))
2789                                 break;
2790
2791                         skb_unlink(skb, &sk->sk_receive_queue);
2792                         consume_skb(skb);
2793
2794                         if (scm.fp)
2795                                 break;
2796                 } else {
2797                         /* It is questionable, see note in unix_dgram_recvmsg.
2798                          */
2799                         if (UNIXCB(skb).fp)
2800                                 unix_peek_fds(&scm, skb);
2801
2802                         sk_peek_offset_fwd(sk, chunk);
2803
2804                         if (UNIXCB(skb).fp)
2805                                 break;
2806
2807                         skip = 0;
2808                         last = skb;
2809                         last_len = skb->len;
2810                         unix_state_lock(sk);
2811                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2812                         if (skb)
2813                                 goto again;
2814                         unix_state_unlock(sk);
2815                         break;
2816                 }
2817         } while (size);
2818
2819         mutex_unlock(&u->iolock);
2820         if (state->msg)
2821                 scm_recv_unix(sock, state->msg, &scm, flags);
2822         else
2823                 scm_destroy(&scm);
2824 out:
2825         return copied ? : err;
2826 }
2827
2828 static int unix_stream_read_actor(struct sk_buff *skb,
2829                                   int skip, int chunk,
2830                                   struct unix_stream_read_state *state)
2831 {
2832         int ret;
2833
2834         ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2835                                     state->msg, chunk);
2836         return ret ?: chunk;
2837 }
2838
2839 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2840                           size_t size, int flags)
2841 {
2842         struct unix_stream_read_state state = {
2843                 .recv_actor = unix_stream_read_actor,
2844                 .socket = sk->sk_socket,
2845                 .msg = msg,
2846                 .size = size,
2847                 .flags = flags
2848         };
2849
2850         return unix_stream_read_generic(&state, true);
2851 }
2852
2853 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2854                                size_t size, int flags)
2855 {
2856         struct unix_stream_read_state state = {
2857                 .recv_actor = unix_stream_read_actor,
2858                 .socket = sock,
2859                 .msg = msg,
2860                 .size = size,
2861                 .flags = flags
2862         };
2863
2864 #ifdef CONFIG_BPF_SYSCALL
2865         struct sock *sk = sock->sk;
2866         const struct proto *prot = READ_ONCE(sk->sk_prot);
2867
2868         if (prot != &unix_stream_proto)
2869                 return prot->recvmsg(sk, msg, size, flags, NULL);
2870 #endif
2871         return unix_stream_read_generic(&state, true);
2872 }
2873
2874 static int unix_stream_splice_actor(struct sk_buff *skb,
2875                                     int skip, int chunk,
2876                                     struct unix_stream_read_state *state)
2877 {
2878         return skb_splice_bits(skb, state->socket->sk,
2879                                UNIXCB(skb).consumed + skip,
2880                                state->pipe, chunk, state->splice_flags);
2881 }
2882
2883 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2884                                        struct pipe_inode_info *pipe,
2885                                        size_t size, unsigned int flags)
2886 {
2887         struct unix_stream_read_state state = {
2888                 .recv_actor = unix_stream_splice_actor,
2889                 .socket = sock,
2890                 .pipe = pipe,
2891                 .size = size,
2892                 .splice_flags = flags,
2893         };
2894
2895         if (unlikely(*ppos))
2896                 return -ESPIPE;
2897
2898         if (sock->file->f_flags & O_NONBLOCK ||
2899             flags & SPLICE_F_NONBLOCK)
2900                 state.flags = MSG_DONTWAIT;
2901
2902         return unix_stream_read_generic(&state, false);
2903 }
2904
2905 static int unix_shutdown(struct socket *sock, int mode)
2906 {
2907         struct sock *sk = sock->sk;
2908         struct sock *other;
2909
2910         if (mode < SHUT_RD || mode > SHUT_RDWR)
2911                 return -EINVAL;
2912         /* This maps:
2913          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2914          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2915          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2916          */
2917         ++mode;
2918
2919         unix_state_lock(sk);
2920         WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2921         other = unix_peer(sk);
2922         if (other)
2923                 sock_hold(other);
2924         unix_state_unlock(sk);
2925         sk->sk_state_change(sk);
2926
2927         if (other &&
2928                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2929
2930                 int peer_mode = 0;
2931                 const struct proto *prot = READ_ONCE(other->sk_prot);
2932
2933                 if (prot->unhash)
2934                         prot->unhash(other);
2935                 if (mode&RCV_SHUTDOWN)
2936                         peer_mode |= SEND_SHUTDOWN;
2937                 if (mode&SEND_SHUTDOWN)
2938                         peer_mode |= RCV_SHUTDOWN;
2939                 unix_state_lock(other);
2940                 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2941                 unix_state_unlock(other);
2942                 other->sk_state_change(other);
2943                 if (peer_mode == SHUTDOWN_MASK)
2944                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2945                 else if (peer_mode & RCV_SHUTDOWN)
2946                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2947         }
2948         if (other)
2949                 sock_put(other);
2950
2951         return 0;
2952 }
2953
2954 long unix_inq_len(struct sock *sk)
2955 {
2956         struct sk_buff *skb;
2957         long amount = 0;
2958
2959         if (sk->sk_state == TCP_LISTEN)
2960                 return -EINVAL;
2961
2962         spin_lock(&sk->sk_receive_queue.lock);
2963         if (sk->sk_type == SOCK_STREAM ||
2964             sk->sk_type == SOCK_SEQPACKET) {
2965                 skb_queue_walk(&sk->sk_receive_queue, skb)
2966                         amount += unix_skb_len(skb);
2967         } else {
2968                 skb = skb_peek(&sk->sk_receive_queue);
2969                 if (skb)
2970                         amount = skb->len;
2971         }
2972         spin_unlock(&sk->sk_receive_queue.lock);
2973
2974         return amount;
2975 }
2976 EXPORT_SYMBOL_GPL(unix_inq_len);
2977
/* Report sk_wmem_alloc_get() for @sk as the socket's output queue length. */
long unix_outq_len(struct sock *sk)
{
        long queued = sk_wmem_alloc_get(sk);

        return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
2983
2984 static int unix_open_file(struct sock *sk)
2985 {
2986         struct path path;
2987         struct file *f;
2988         int fd;
2989
2990         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2991                 return -EPERM;
2992
2993         if (!smp_load_acquire(&unix_sk(sk)->addr))
2994                 return -ENOENT;
2995
2996         path = unix_sk(sk)->path;
2997         if (!path.dentry)
2998                 return -ENOENT;
2999
3000         path_get(&path);
3001
3002         fd = get_unused_fd_flags(O_CLOEXEC);
3003         if (fd < 0)
3004                 goto out;
3005
3006         f = dentry_open(&path, O_PATH, current_cred());
3007         if (IS_ERR(f)) {
3008                 put_unused_fd(fd);
3009                 fd = PTR_ERR(f);
3010                 goto out;
3011         }
3012
3013         fd_install(fd, f);
3014 out:
3015         path_put(&path);
3016
3017         return fd;
3018 }
3019
3020 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3021 {
3022         struct sock *sk = sock->sk;
3023         long amount = 0;
3024         int err;
3025
3026         switch (cmd) {
3027         case SIOCOUTQ:
3028                 amount = unix_outq_len(sk);
3029                 err = put_user(amount, (int __user *)arg);
3030                 break;
3031         case SIOCINQ:
3032                 amount = unix_inq_len(sk);
3033                 if (amount < 0)
3034                         err = amount;
3035                 else
3036                         err = put_user(amount, (int __user *)arg);
3037                 break;
3038         case SIOCUNIXFILE:
3039                 err = unix_open_file(sk);
3040                 break;
3041 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3042         case SIOCATMARK:
3043                 {
3044                         struct sk_buff *skb;
3045                         int answ = 0;
3046
3047                         skb = skb_peek(&sk->sk_receive_queue);
3048                         if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3049                                 answ = 1;
3050                         err = put_user(answ, (int __user *)arg);
3051                 }
3052                 break;
3053 #endif
3054         default:
3055                 err = -ENOIOCTLCMD;
3056                 break;
3057         }
3058         return err;
3059 }
3060
3061 #ifdef CONFIG_COMPAT
/* Compat ioctl: convert @arg with compat_ptr() and reuse unix_ioctl(). */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        unsigned long native_arg = (unsigned long)compat_ptr(arg);

        return unix_ioctl(sock, cmd, native_arg);
}
3066 #endif
3067
3068 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3069 {
3070         struct sock *sk = sock->sk;
3071         __poll_t mask;
3072         u8 shutdown;
3073
3074         sock_poll_wait(file, sock, wait);
3075         mask = 0;
3076         shutdown = READ_ONCE(sk->sk_shutdown);
3077
3078         /* exceptional events? */
3079         if (READ_ONCE(sk->sk_err))
3080                 mask |= EPOLLERR;
3081         if (shutdown == SHUTDOWN_MASK)
3082                 mask |= EPOLLHUP;
3083         if (shutdown & RCV_SHUTDOWN)
3084                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3085
3086         /* readable? */
3087         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3088                 mask |= EPOLLIN | EPOLLRDNORM;
3089         if (sk_is_readable(sk))
3090                 mask |= EPOLLIN | EPOLLRDNORM;
3091 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3092         if (READ_ONCE(unix_sk(sk)->oob_skb))
3093                 mask |= EPOLLPRI;
3094 #endif
3095
3096         /* Connection-based need to check for termination and startup */
3097         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3098             sk->sk_state == TCP_CLOSE)
3099                 mask |= EPOLLHUP;
3100
3101         /*
3102          * we set writable also when the other side has shut down the
3103          * connection. This prevents stuck sockets.
3104          */
3105         if (unix_writable(sk))
3106                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3107
3108         return mask;
3109 }
3110
3111 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3112                                     poll_table *wait)
3113 {
3114         struct sock *sk = sock->sk, *other;
3115         unsigned int writable;
3116         __poll_t mask;
3117         u8 shutdown;
3118
3119         sock_poll_wait(file, sock, wait);
3120         mask = 0;
3121         shutdown = READ_ONCE(sk->sk_shutdown);
3122
3123         /* exceptional events? */
3124         if (READ_ONCE(sk->sk_err) ||
3125             !skb_queue_empty_lockless(&sk->sk_error_queue))
3126                 mask |= EPOLLERR |
3127                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3128
3129         if (shutdown & RCV_SHUTDOWN)
3130                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3131         if (shutdown == SHUTDOWN_MASK)
3132                 mask |= EPOLLHUP;
3133
3134         /* readable? */
3135         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3136                 mask |= EPOLLIN | EPOLLRDNORM;
3137         if (sk_is_readable(sk))
3138                 mask |= EPOLLIN | EPOLLRDNORM;
3139
3140         /* Connection-based need to check for termination and startup */
3141         if (sk->sk_type == SOCK_SEQPACKET) {
3142                 if (sk->sk_state == TCP_CLOSE)
3143                         mask |= EPOLLHUP;
3144                 /* connection hasn't started yet? */
3145                 if (sk->sk_state == TCP_SYN_SENT)
3146                         return mask;
3147         }
3148
3149         /* No write status requested, avoid expensive OUT tests. */
3150         if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3151                 return mask;
3152
3153         writable = unix_writable(sk);
3154         if (writable) {
3155                 unix_state_lock(sk);
3156
3157                 other = unix_peer(sk);
3158                 if (other && unix_peer(other) != sk &&
3159                     unix_recvq_full_lockless(other) &&
3160                     unix_dgram_peer_wake_me(sk, other))
3161                         writable = 0;
3162
3163                 unix_state_unlock(sk);
3164         }
3165
3166         if (writable)
3167                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3168         else
3169                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3170
3171         return mask;
3172 }
3173
3174 #ifdef CONFIG_PROC_FS
3175
/*
 * A seq_file position packs a hash bucket index into its high bits and a
 * 1-based offset within that bucket into the low BUCKET_SPACE bits.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(pos) ((pos) >> BUCKET_SPACE)
#define get_offset(pos) ((pos) & ((1UL << BUCKET_SPACE) - 1))
#define set_bucket_offset(bucket, offset) (((bucket) << BUCKET_SPACE) | (offset))
3181
3182 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3183 {
3184         unsigned long offset = get_offset(*pos);
3185         unsigned long bucket = get_bucket(*pos);
3186         unsigned long count = 0;
3187         struct sock *sk;
3188
3189         for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3190              sk; sk = sk_next(sk)) {
3191                 if (++count == offset)
3192                         break;
3193         }
3194
3195         return sk;
3196 }
3197
3198 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3199 {
3200         unsigned long bucket = get_bucket(*pos);
3201         struct net *net = seq_file_net(seq);
3202         struct sock *sk;
3203
3204         while (bucket < UNIX_HASH_SIZE) {
3205                 spin_lock(&net->unx.table.locks[bucket]);
3206
3207                 sk = unix_from_bucket(seq, pos);
3208                 if (sk)
3209                         return sk;
3210
3211                 spin_unlock(&net->unx.table.locks[bucket]);
3212
3213                 *pos = set_bucket_offset(++bucket, 1);
3214         }
3215
3216         return NULL;
3217 }
3218
3219 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3220                                   loff_t *pos)
3221 {
3222         unsigned long bucket = get_bucket(*pos);
3223
3224         sk = sk_next(sk);
3225         if (sk)
3226                 return sk;
3227
3228
3229         spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3230
3231         *pos = set_bucket_offset(++bucket, 1);
3232
3233         return unix_get_first(seq, pos);
3234 }
3235
3236 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3237 {
3238         if (!*pos)
3239                 return SEQ_START_TOKEN;
3240
3241         return unix_get_first(seq, pos);
3242 }
3243
3244 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3245 {
3246         ++*pos;
3247
3248         if (v == SEQ_START_TOKEN)
3249                 return unix_get_first(seq, pos);
3250
3251         return unix_get_next(seq, v, pos);
3252 }
3253
3254 static void unix_seq_stop(struct seq_file *seq, void *v)
3255 {
3256         struct sock *sk = v;
3257
3258         if (sk)
3259                 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3260 }
3261
3262 static int unix_seq_show(struct seq_file *seq, void *v)
3263 {
3264
3265         if (v == SEQ_START_TOKEN)
3266                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3267                          "Inode Path\n");
3268         else {
3269                 struct sock *s = v;
3270                 struct unix_sock *u = unix_sk(s);
3271                 unix_state_lock(s);
3272
3273                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3274                         s,
3275                         refcount_read(&s->sk_refcnt),
3276                         0,
3277                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3278                         s->sk_type,
3279                         s->sk_socket ?
3280                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3281                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3282                         sock_i_ino(s));
3283
3284                 if (u->addr) {  // under a hash table lock here
3285                         int i, len;
3286                         seq_putc(seq, ' ');
3287
3288                         i = 0;
3289                         len = u->addr->len -
3290                                 offsetof(struct sockaddr_un, sun_path);
3291                         if (u->addr->name->sun_path[0]) {
3292                                 len--;
3293                         } else {
3294                                 seq_putc(seq, '@');
3295                                 i++;
3296                         }
3297                         for ( ; i < len; i++)
3298                                 seq_putc(seq, u->addr->name->sun_path[i] ?:
3299                                          '@');
3300                 }
3301                 unix_state_unlock(s);
3302                 seq_putc(seq, '\n');
3303         }
3304
3305         return 0;
3306 }
3307
3308 static const struct seq_operations unix_seq_ops = {
3309         .start  = unix_seq_start,
3310         .next   = unix_seq_next,
3311         .stop   = unix_seq_stop,
3312         .show   = unix_seq_show,
3313 };
3314
3315 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
3316 struct bpf_unix_iter_state {            /* per-seq_file state of the AF_UNIX BPF iterator */
3317         struct seq_net_private p;
3318         unsigned int cur_sk;            /* index in batch[] of the socket being shown */
3319         unsigned int end_sk;            /* number of sockets currently held in batch[] */
3320         unsigned int max_sk;            /* capacity of batch[] */
3321         struct sock **batch;            /* sockets held (sock_hold) for the current bucket */
3322         bool st_bucket_done;            /* set when a whole bucket fit into batch[] */
3323 };
3324
3325 struct bpf_iter__unix {                 /* context filled in by unix_prog_seq_show() for the BPF program */
3326         __bpf_md_ptr(struct bpf_iter_meta *, meta);     /* iterator metadata */
3327         __bpf_md_ptr(struct unix_sock *, unix_sk);      /* socket being visited; NULL in the call made from ->stop */
3328         uid_t uid __aligned(8);         /* uid passed by the caller; 0 in the call made from ->stop */
3329 };
3330
3331 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3332                               struct unix_sock *unix_sk, uid_t uid)
3333 {
3334         struct bpf_iter__unix ctx;
3335
3336         meta->seq_num--;  /* skip SEQ_START_TOKEN */
3337         ctx.meta = meta;
3338         ctx.unix_sk = unix_sk;
3339         ctx.uid = uid;
3340         return bpf_iter_run_prog(prog, &ctx);
3341 }
3342
3343 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3344
3345 {
3346         struct bpf_unix_iter_state *iter = seq->private;
3347         unsigned int expected = 1;
3348         struct sock *sk;
3349
3350         sock_hold(start_sk);
3351         iter->batch[iter->end_sk++] = start_sk;
3352
3353         for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3354                 if (iter->end_sk < iter->max_sk) {
3355                         sock_hold(sk);
3356                         iter->batch[iter->end_sk++] = sk;
3357                 }
3358
3359                 expected++;
3360         }
3361
3362         spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3363
3364         return expected;
3365 }
3366
3367 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3368 {
3369         while (iter->cur_sk < iter->end_sk)
3370                 sock_put(iter->batch[iter->cur_sk++]);
3371 }
3372
3373 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3374                                        unsigned int new_batch_sz)
3375 {
3376         struct sock **new_batch;
3377
3378         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3379                              GFP_USER | __GFP_NOWARN);
3380         if (!new_batch)
3381                 return -ENOMEM;
3382
3383         bpf_iter_unix_put_batch(iter);
3384         kvfree(iter->batch);
3385         iter->batch = new_batch;
3386         iter->max_sk = new_batch_sz;
3387
3388         return 0;
3389 }
3390
3391 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3392                                         loff_t *pos)
3393 {
3394         struct bpf_unix_iter_state *iter = seq->private;
3395         unsigned int expected;
3396         bool resized = false;
3397         struct sock *sk;
3398
3399         if (iter->st_bucket_done)
3400                 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3401
3402 again:
3403         /* Get a new batch */
3404         iter->cur_sk = 0;
3405         iter->end_sk = 0;
3406
3407         sk = unix_get_first(seq, pos);
3408         if (!sk)
3409                 return NULL; /* Done */
3410
3411         expected = bpf_iter_unix_hold_batch(seq, sk);
3412
3413         if (iter->end_sk == expected) {
3414                 iter->st_bucket_done = true;
3415                 return sk;
3416         }
3417
3418         if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3419                 resized = true;
3420                 goto again;
3421         }
3422
3423         return sk;
3424 }
3425
3426 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3427 {
3428         if (!*pos)
3429                 return SEQ_START_TOKEN;
3430
3431         /* bpf iter does not support lseek, so it always
3432          * continue from where it was stop()-ped.
3433          */
3434         return bpf_iter_unix_batch(seq, pos);
3435 }
3436
3437 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3438 {
3439         struct bpf_unix_iter_state *iter = seq->private;
3440         struct sock *sk;
3441
3442         /* Whenever seq_next() is called, the iter->cur_sk is
3443          * done with seq_show(), so advance to the next sk in
3444          * the batch.
3445          */
3446         if (iter->cur_sk < iter->end_sk)
3447                 sock_put(iter->batch[iter->cur_sk++]);
3448
3449         ++*pos;
3450
3451         if (iter->cur_sk < iter->end_sk)
3452                 sk = iter->batch[iter->cur_sk];
3453         else
3454                 sk = bpf_iter_unix_batch(seq, pos);
3455
3456         return sk;
3457 }
3458
3459 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3460 {
3461         struct bpf_iter_meta meta;
3462         struct bpf_prog *prog;
3463         struct sock *sk = v;
3464         uid_t uid;
3465         bool slow;
3466         int ret;
3467
3468         if (v == SEQ_START_TOKEN)
3469                 return 0;
3470
3471         slow = lock_sock_fast(sk);
3472
3473         if (unlikely(sk_unhashed(sk))) {
3474                 ret = SEQ_SKIP;
3475                 goto unlock;
3476         }
3477
3478         uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3479         meta.seq = seq;
3480         prog = bpf_iter_get_info(&meta, false);
3481         ret = unix_prog_seq_show(prog, &meta, v, uid);
3482 unlock:
3483         unlock_sock_fast(sk, slow);
3484         return ret;
3485 }
3486
3487 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3488 {
3489         struct bpf_unix_iter_state *iter = seq->private;
3490         struct bpf_iter_meta meta;
3491         struct bpf_prog *prog;
3492
3493         if (!v) {
3494                 meta.seq = seq;
3495                 prog = bpf_iter_get_info(&meta, true);
3496                 if (prog)
3497                         (void)unix_prog_seq_show(prog, &meta, v, 0);
3498         }
3499
3500         if (iter->cur_sk < iter->end_sk)
3501                 bpf_iter_unix_put_batch(iter);
3502 }
3503
3504 static const struct seq_operations bpf_iter_unix_seq_ops = {
3505         .start  = bpf_iter_unix_seq_start,
3506         .next   = bpf_iter_unix_seq_next,
3507         .stop   = bpf_iter_unix_seq_stop,
3508         .show   = bpf_iter_unix_seq_show,
3509 };
3510 #endif
3511 #endif
3512
3513 static const struct net_proto_family unix_family_ops = {
3514         .family = PF_UNIX,
3515         .create = unix_create,
3516         .owner  = THIS_MODULE,
3517 };
3518
3519
3520 static int __net_init unix_net_init(struct net *net)
3521 {
3522         int i;
3523
3524         net->unx.sysctl_max_dgram_qlen = 10;
3525         if (unix_sysctl_register(net))
3526                 goto out;
3527
3528 #ifdef CONFIG_PROC_FS
3529         if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3530                              sizeof(struct seq_net_private)))
3531                 goto err_sysctl;
3532 #endif
3533
3534         net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3535                                               sizeof(spinlock_t), GFP_KERNEL);
3536         if (!net->unx.table.locks)
3537                 goto err_proc;
3538
3539         net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3540                                                 sizeof(struct hlist_head),
3541                                                 GFP_KERNEL);
3542         if (!net->unx.table.buckets)
3543                 goto free_locks;
3544
3545         for (i = 0; i < UNIX_HASH_SIZE; i++) {
3546                 spin_lock_init(&net->unx.table.locks[i]);
3547                 INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3548         }
3549
3550         return 0;
3551
3552 free_locks:
3553         kvfree(net->unx.table.locks);
3554 err_proc:
3555 #ifdef CONFIG_PROC_FS
3556         remove_proc_entry("unix", net->proc_net);
3557 err_sysctl:
3558 #endif
3559         unix_sysctl_unregister(net);
3560 out:
3561         return -ENOMEM;
3562 }
3563
3564 static void __net_exit unix_net_exit(struct net *net)
3565 {
3566         kvfree(net->unx.table.buckets);
3567         kvfree(net->unx.table.locks);
3568         unix_sysctl_unregister(net);
3569         remove_proc_entry("unix", net->proc_net);
3570 }
3571
3572 static struct pernet_operations unix_net_ops = {
3573         .init = unix_net_init,
3574         .exit = unix_net_exit,
3575 };
3576
3577 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3578 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3579                      struct unix_sock *unix_sk, uid_t uid)
3580
/* Batch size requested when the iterator's private state is initialised. */
#define INIT_BATCH_SZ 16

/* init_seq_private hook of the unix bpf iterator.
 *
 * Runs bpf_iter_init_seq_net() and then sizes the batch to INIT_BATCH_SZ
 * through bpf_iter_unix_realloc_batch().  If the latter fails, the seq_net
 * part is finalised again.  Returns 0 on success or the error of whichever
 * step failed.
 */
static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_unix_iter_state *state = priv_data;
	int ret;

	ret = bpf_iter_init_seq_net(priv_data, aux);
	if (ret)
		return ret;

	ret = bpf_iter_unix_realloc_batch(state, INIT_BATCH_SZ);
	if (ret)
		bpf_iter_fini_seq_net(priv_data);

	return ret;
}
3600
3601 static void bpf_iter_fini_unix(void *priv_data)
3602 {
3603         struct bpf_unix_iter_state *iter = priv_data;
3604
3605         bpf_iter_fini_seq_net(priv_data);
3606         kvfree(iter->batch);
3607 }
3608
3609 static const struct bpf_iter_seq_info unix_seq_info = {
3610         .seq_ops                = &bpf_iter_unix_seq_ops,
3611         .init_seq_private       = bpf_iter_init_unix,
3612         .fini_seq_private       = bpf_iter_fini_unix,
3613         .seq_priv_size          = sizeof(struct bpf_unix_iter_state),
3614 };
3615
3616 static const struct bpf_func_proto *
3617 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3618                              const struct bpf_prog *prog)
3619 {
3620         switch (func_id) {
3621         case BPF_FUNC_setsockopt:
3622                 return &bpf_sk_setsockopt_proto;
3623         case BPF_FUNC_getsockopt:
3624                 return &bpf_sk_getsockopt_proto;
3625         default:
3626                 return NULL;
3627         }
3628 }
3629
3630 static struct bpf_iter_reg unix_reg_info = {
3631         .target                 = "unix",
3632         .ctx_arg_info_size      = 1,
3633         .ctx_arg_info           = {
3634                 { offsetof(struct bpf_iter__unix, unix_sk),
3635                   PTR_TO_BTF_ID_OR_NULL },
3636         },
3637         .get_func_proto         = bpf_iter_unix_get_func_proto,
3638         .seq_info               = &unix_seq_info,
3639 };
3640
3641 static void __init bpf_iter_register(void)
3642 {
3643         unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3644         if (bpf_iter_reg_target(&unix_reg_info))
3645                 pr_warn("Warning: could not register bpf iterator unix\n");
3646 }
3647 #endif
3648
3649 static int __init af_unix_init(void)
3650 {
3651         int i, rc = -1;
3652
3653         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3654
3655         for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3656                 spin_lock_init(&bsd_socket_locks[i]);
3657                 INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3658         }
3659
3660         rc = proto_register(&unix_dgram_proto, 1);
3661         if (rc != 0) {
3662                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3663                 goto out;
3664         }
3665
3666         rc = proto_register(&unix_stream_proto, 1);
3667         if (rc != 0) {
3668                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3669                 proto_unregister(&unix_dgram_proto);
3670                 goto out;
3671         }
3672
3673         sock_register(&unix_family_ops);
3674         register_pernet_subsys(&unix_net_ops);
3675         unix_bpf_build_proto();
3676
3677 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3678         bpf_iter_register();
3679 #endif
3680
3681 out:
3682         return rc;
3683 }
3684
3685 static void __exit af_unix_exit(void)
3686 {
3687         sock_unregister(PF_UNIX);
3688         proto_unregister(&unix_dgram_proto);
3689         proto_unregister(&unix_stream_proto);
3690         unregister_pernet_subsys(&unix_net_ops);
3691 }
3692
3693 /* Earlier than device_initcall() so that other drivers invoking
3694    request_module() don't end up in a loop when modprobe tries
3695    to use a UNIX socket. But later than subsys_initcall() because
3696    we depend on stuff initialised there */
3697 fs_initcall(af_unix_init);
3698 module_exit(af_unix_exit);
3699
3700 MODULE_LICENSE("GPL");
3701 MODULE_ALIAS_NETPROTO(PF_UNIX);