net/unix/af_unix.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * NET4:        Implementation of BSD Unix domain sockets.
   4  *
   5  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   6  *
   7  * Fixes:
   8  *              Linus Torvalds  :       Assorted bug cures.
   9  *              Niibe Yutaka    :       async I/O support.
  10  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  11  *              Alan Cox        :       Limit size of allocated blocks.
  12  *              Alan Cox        :       Fixed the stupid socketpair bug.
  13  *              Alan Cox        :       BSD compatibility fine tuning.
  14  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  15  *              Alan Cox        :       Sorted out a proper draft version of
  16  *                                      file descriptor passing hacked up from
  17  *                                      Mike Shaver's work.
  18  *              Marty Leisner   :       Fixes to fd passing
  19  *              Nick Nevin      :       recvmsg bugfix.
  20  *              Alan Cox        :       Started proper garbage collector
  21  *              Heiko EiBfeldt  :       Missing verify_area check
  22  *              Alan Cox        :       Started POSIXisms
  23  *              Andreas Schwab  :       Replace inode by dentry for proper
  24  *                                      reference counting
  25  *              Kirk Petersen   :       Made this a module
  26  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  27  *                                      Lots of bug fixes.
  28  *           Alexey Kuznetosv   :       Repaired (I hope) bugs introduces
  29  *                                      by above two patches.
   30  *           Andrea Arcangeli   :       If possible we block in connect(2)
   31  *                                      if the max backlog of the listen socket
   32  *                                      has been reached. This won't break
   33  *                                      old apps and it will avoid a huge number
   34  *                                      of socks being hashed (this is for
   35  *                                      unix_gc() performance reasons).
  36  *                                      Security fix that limits the max
  37  *                                      number of socks to 2*max_files and
  38  *                                      the number of skb queueable in the
  39  *                                      dgram receiver.
  40  *              Artur Skawina   :       Hash function optimizations
  41  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  42  *            Malcolm Beattie   :       Set peercred for socketpair
  43  *           Michal Ostrowski   :       Module initialization cleanup.
  44  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  45  *                                      the core infrastructure is doing that
  46  *                                      for all net proto families now (2.5.69+)
  47  *
  48  * Known differences from reference BSD that was tested:
  49  *
  50  *      [TO FIX]
  51  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  52  *              other the moment one end closes.
  53  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  54  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  55  *      [NOT TO FIX]
  56  *      accept() returns a path name even if the connecting socket has closed
  57  *              in the meantime (BSD loses the path and gives up).
  58  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  59  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  60  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  61  *      BSD af_unix apparently has connect forgetting to block properly.
  62  *              (need to check this with the POSIX spec in detail)
  63  *
  64  * Differences from 2.0.0-11-... (ANK)
  65  *      Bug fixes and improvements.
  66  *              - client shutdown killed server socket.
  67  *              - removed all useless cli/sti pairs.
  68  *
  69  *      Semantic changes/extensions.
  70  *              - generic control message passing.
  71  *              - SCM_CREDENTIALS control message.
  72  *              - "Abstract" (not FS based) socket bindings.
  73  *                Abstract names are sequences of bytes (not zero terminated)
  74  *                started by 0, so that this name space does not intersect
  75  *                with BSD names.
  76  */
  77
  78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  79
  80 #include <linux/module.h>
  81 #include <linux/kernel.h>
  82 #include <linux/signal.h>
  83 #include <linux/sched/signal.h>
  84 #include <linux/errno.h>
  85 #include <linux/string.h>
  86 #include <linux/stat.h>
  87 #include <linux/dcache.h>
  88 #include <linux/namei.h>
  89 #include <linux/socket.h>
  90 #include <linux/un.h>
  91 #include <linux/fcntl.h>
  92 #include <linux/filter.h>
  93 #include <linux/termios.h>
  94 #include <linux/sockios.h>
  95 #include <linux/net.h>
  96 #include <linux/in.h>
  97 #include <linux/fs.h>
  98 #include <linux/slab.h>
  99 #include <linux/uaccess.h>
 100 #include <linux/skbuff.h>
 101 #include <linux/netdevice.h>
 102 #include <net/net_namespace.h>
 103 #include <net/sock.h>
 104 #include <net/tcp_states.h>
 105 #include <net/af_unix.h>
 106 #include <linux/proc_fs.h>
 107 #include <linux/seq_file.h>
 108 #include <net/scm.h>
 109 #include <linux/init.h>
 110 #include <linux/poll.h>
 111 #include <linux/rtnetlink.h>
 112 #include <linux/mount.h>
 113 #include <net/checksum.h>
 114 #include <linux/security.h>
 115 #include <linux/splice.h>
 116 #include <linux/freezer.h>
 117 #include <linux/file.h>
 118 #include <linux/btf_ids.h>
 119
 120 #include "scm.h"
 121
  122 static atomic_long_t unix_nr_socks; /* live-socket counter; decremented in unix_sock_destructor() */
  123 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; /* sk_bind_node lists, indexed by sk->sk_hash */
  124 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; /* lock [h] is held around every access to bucket [h] */
 125
 126 /* SMP locking strategy:
 127  *    hash table is protected with spinlock.
 128  *    each socket state is protected by separate spinlock.
 129  */
 130
 131 static unsigned int unix_unbound_hash(struct sock *sk)
 132 {
 133         unsigned long hash = (unsigned long)sk;
 134
 135         hash ^= hash >> 16;
 136         hash ^= hash >> 8;
 137         hash ^= sk->sk_type;
 138
 139         return hash & UNIX_HASH_MOD;
 140 }
 141
 142 static unsigned int unix_bsd_hash(struct inode *i)
 143 {
 144         return i->i_ino & UNIX_HASH_MOD;
 145 }
 146
 147 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
 148                                        int addr_len, int type)
 149 {
 150         __wsum csum = csum_partial(sunaddr, addr_len, 0);
 151         unsigned int hash;
 152
 153         hash = (__force unsigned int)csum_fold(csum);
 154         hash ^= hash >> 8;
 155         hash ^= type;
 156
 157         return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
 158 }
 159
  160 static void unix_table_double_lock(struct net *net,
  161                                    unsigned int hash1, unsigned int hash2)
  162 {
              /* Lock the per-netns table buckets hash1 and hash2.
               *
               * When both indices are equal only one lock is taken.  Otherwise
               * the lower index is always locked first, so two callers passing
               * the same pair in opposite order cannot deadlock.  The second
               * lock is taken with SINGLE_DEPTH_NESTING for lockdep.
               * Pair with unix_table_double_unlock().
               */
  163         if (hash1 == hash2) {
  164                 spin_lock(&net->unx.table.locks[hash1]);
  165                 return;
  166         }
  167
              /* Normalise so that hash1 < hash2 (hash1/hash2 are by-value copies). */
  168         if (hash1 > hash2)
  169                 swap(hash1, hash2);
  170
  171         spin_lock(&net->unx.table.locks[hash1]);
  172         spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
  173 }
 174
  175 static void unix_table_double_unlock(struct net *net,
  176                                      unsigned int hash1, unsigned int hash2)
  177 {
              /* Release the bucket lock(s) taken by unix_table_double_lock().
               * Equal indices mean a single lock was taken, so only one unlock
               * is done.  Unlike the lock side, the indices are not sorted
               * here; the locks are released in argument order.
               */
  178         if (hash1 == hash2) {
  179                 spin_unlock(&net->unx.table.locks[hash1]);
  180                 return;
  181         }
  182
  183         spin_unlock(&net->unx.table.locks[hash1]);
  184         spin_unlock(&net->unx.table.locks[hash2]);
  185 }
 186
  187 #ifdef CONFIG_SECURITY_NETWORK
      /* Security-id plumbing between an scm_cookie and an skb's UNIXCB. */
  188 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
  189 {
              /* Note the direction: the skb control block receives scm->secid. */
  190         UNIXCB(skb).secid = scm->secid;
  191 }
  192
  193 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
  194 {
              /* Opposite direction: the scm cookie receives the skb's secid. */
  195         scm->secid = UNIXCB(skb).secid;
  196 }
  197
  198 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
  199 {
              /* True when the cookie and the skb carry the same secid. */
  200         return (scm->secid == UNIXCB(skb).secid);
  201 }
  202 #else
      /* Without CONFIG_SECURITY_NETWORK the copies are no-ops and the
       * comparison always reports equality.
       */
  203 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
  204 { }
  205
  206 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
  207 { }
  208
  209 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
  210 {
  211         return true;
  212 }
  213 #endif /* CONFIG_SECURITY_NETWORK */
 214
 215 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 216 {
 217         return unix_peer(osk) == sk;
 218 }
 219
 220 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 221 {
 222         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 223 }
 224
 225 static inline int unix_recvq_full(const struct sock *sk)
 226 {
 227         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 228 }
 229
 230 static inline int unix_recvq_full_lockless(const struct sock *sk)
 231 {
 232         return skb_queue_len_lockless(&sk->sk_receive_queue) >
 233                 READ_ONCE(sk->sk_max_ack_backlog);
 234 }
 235
  236 struct sock *unix_peer_get(struct sock *s)
  237 {
              /* Return the current peer of s with an extra reference taken by
               * sock_hold(), or NULL when s has no peer.  The peer pointer is
               * read and pinned while holding unix_state_lock(s).
               * NOTE(review): callers presumably drop the reference with
               * sock_put() - confirm at the call sites.
               */
  238         struct sock *peer;
  239
  240         unix_state_lock(s);
  241         peer = unix_peer(s);
  242         if (peer)
  243                 sock_hold(peer);
  244         unix_state_unlock(s);
  245         return peer;
  246 }
  247 EXPORT_SYMBOL_GPL(unix_peer_get);
 248
 249 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
 250                                              int addr_len)
 251 {
 252         struct unix_address *addr;
 253
 254         addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
 255         if (!addr)
 256                 return NULL;
 257
 258         refcount_set(&addr->refcnt, 1);
 259         addr->len = addr_len;
 260         memcpy(addr->name, sunaddr, addr_len);
 261
 262         return addr;
 263 }
 264
  265 static inline void unix_release_addr(struct unix_address *addr)
  266 {
              /* Drop one reference; the address is freed when the count hits zero. */
  267         if (refcount_dec_and_test(&addr->refcnt))
  268                 kfree(addr);
  269 }
 270
  271 /*
  272  *      Check unix socket name:
  273  *              - should be not zero length.
  274  *              - if started by not zero, should be NULL terminated (FS object)
  275  *              - if started by zero, it is abstract name.
  276  */
  277
  278 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
  279 {
              /* What is actually enforced here:
               *   - addr_len must cover at least one byte of sun_path and must
               *     not exceed sizeof(struct sockaddr_un);
               *   - sun_family must be AF_UNIX.
               * Returns 0 when both hold, -EINVAL otherwise.  Termination of
               * filesystem names is not checked in this function; see
               * unix_mkname_bsd() for where the terminator is written.
               */
  280         if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
  281             addr_len > sizeof(*sunaddr))
  282                 return -EINVAL;
  283
  284         if (sunaddr->sun_family != AF_UNIX)
  285                 return -EINVAL;
  286
  287         return 0;
  288 }
 289
  290 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
  291 {
              /* Force a terminating zero byte right after the addr_len bytes
               * supplied by the caller, then return the effective address
               * length: offset of the path + strlen(path) + 1 for the
               * terminator.  The buffer is accessed through a sockaddr_storage
               * view; the BUILD_BUG_ON checks that its __data member starts at
               * the same offset as sun_path.
               */
  292         struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
  293         short offset = offsetof(struct sockaddr_storage, __data);
  294
  295         BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
  296
  297         /* This may look like an off by one error but it is a bit more
  298          * subtle.  108 is the longest valid AF_UNIX path for a binding.
  299          * sun_path[108] doesn't as such exist.  However in kernel space
  300          * we are guaranteed that it is a valid memory location in our
  301          * kernel address buffer because syscall functions always pass
  302          * a pointer of struct sockaddr_storage which has a bigger buffer
  303          * than 108.  Also, we must terminate sun_path for strlen() in
  304          * getname_kernel().
  305          */
  306         addr->__data[addr_len - offset] = 0;
  307
  308         /* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
  309          * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
  310          * know the actual buffer.
  311          */
  312         return strlen(addr->__data) + offset + 1;
  313 }
 314
  315 static void __unix_remove_socket(struct sock *sk)
  316 {
              /* Unlink sk from its hash list and re-initialise the node.  No
               * lock is taken here; unix_remove_socket() shows the expected
               * locking around this helper.
               */
  317         sk_del_node_init(sk);
  318 }
 319
  320 static void __unix_insert_socket(struct net *net, struct sock *sk)
  321 {
              /* Add sk to the per-netns bucket selected by sk->sk_hash.  A debug
               * warning fires once if sk is still hashed.  No lock is taken
               * here; see unix_insert_unbound_socket() for the locked wrapper.
               */
  322         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
  323         sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
  324 }
 325
  326 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
  327                                  struct unix_address *addr, unsigned int hash)
  328 {
              /* Move sk to bucket 'hash' and publish its address: unlink from
               * the current bucket, store addr with release semantics, record
               * the new hash and re-insert.
               * NOTE(review): no lock is taken here; the caller presumably
               * holds the table locks for both the old and the new bucket
               * (cf. unix_table_double_lock()) - confirm at the call sites.
               */
  329         __unix_remove_socket(sk);
  330         smp_store_release(&unix_sk(sk)->addr, addr);
  331
  332         sk->sk_hash = hash;
  333         __unix_insert_socket(net, sk);
  334 }
 335
  336 static void unix_remove_socket(struct net *net, struct sock *sk)
  337 {
              /* Unhash sk while holding the lock of the bucket named by sk->sk_hash. */
  338         spin_lock(&net->unx.table.locks[sk->sk_hash]);
  339         __unix_remove_socket(sk);
  340         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
  341 }
 342
  343 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
  344 {
              /* Hash sk into the bucket named by sk->sk_hash under that bucket's
               * lock.  sk->sk_hash must already be set by the caller.
               */
  345         spin_lock(&net->unx.table.locks[sk->sk_hash]);
  346         __unix_insert_socket(net, sk);
  347         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
  348 }
 349
  350 static void unix_insert_bsd_socket(struct sock *sk)
  351 {
              /* Link sk (via its bind node) into the global bsd_socket_buckets
               * list selected by sk->sk_hash, under the matching lock.
               * NOTE(review): assumes sk->sk_hash already holds a
               * unix_bsd_hash() value that is in range for the array - confirm
               * at the call site.
               */
  352         spin_lock(&bsd_socket_locks[sk->sk_hash]);
  353         sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
  354         spin_unlock(&bsd_socket_locks[sk->sk_hash]);
  355 }
 356
  357 static void unix_remove_bsd_socket(struct sock *sk)
  358 {
              /* Undo unix_insert_bsd_socket().  Does nothing when the bind node
               * is not on any list.  The "is it hashed" test is made before the
               * bucket lock is taken.
               */
  359         if (!hlist_unhashed(&sk->sk_bind_node)) {
  360                 spin_lock(&bsd_socket_locks[sk->sk_hash]);
  361                 __sk_del_bind_node(sk);
  362                 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
  363
                      /* Reset the node so a later call sees it as unhashed. */
  364                 sk_node_init(&sk->sk_bind_node);
  365         }
  366 }
 367
 368 static struct sock *__unix_find_socket_byname(struct net *net,
 369                                               struct sockaddr_un *sunname,
 370                                               int len, unsigned int hash)
 371 {
 372         struct sock *s;
 373
 374         sk_for_each(s, &net->unx.table.buckets[hash]) {
 375                 struct unix_sock *u = unix_sk(s);
 376
 377                 if (u->addr->len == len &&
 378                     !memcmp(u->addr->name, sunname, len))
 379                         return s;
 380         }
 381         return NULL;
 382 }
 383
  384 static inline struct sock *unix_find_socket_byname(struct net *net,
  385                                                    struct sockaddr_un *sunname,
  386                                                    int len, unsigned int hash)
  387 {
              /* Locked lookup by name: searches bucket 'hash' under its lock and,
               * on a hit, takes a reference with sock_hold() before the lock is
               * dropped.  Returns the referenced socket or NULL.
               */
  388         struct sock *s;
  389
  390         spin_lock(&net->unx.table.locks[hash]);
  391         s = __unix_find_socket_byname(net, sunname, len, hash);
  392         if (s)
  393                 sock_hold(s);
  394         spin_unlock(&net->unx.table.locks[hash]);
  395         return s;
  396 }
 397
  398 static struct sock *unix_find_socket_byinode(struct inode *i)
  399 {
              /* Look up a socket by inode in the global bsd_socket_buckets table.
               * The bucket is chosen with unix_bsd_hash(i) and walked under its
               * lock; a socket matches when its path.dentry is set and backs
               * inode i.  A match is returned with a reference held
               * (sock_hold()); NULL is returned when nothing matches.
               */
  400         unsigned int hash = unix_bsd_hash(i);
  401         struct sock *s;
  402
  403         spin_lock(&bsd_socket_locks[hash]);
  404         sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
  405                 struct dentry *dentry = unix_sk(s)->path.dentry;
  406
  407                 if (dentry && d_backing_inode(dentry) == i) {
  408                         sock_hold(s);
  409                         spin_unlock(&bsd_socket_locks[hash]);
  410                         return s;
  411                 }
  412         }
  413         spin_unlock(&bsd_socket_locks[hash]);
  414         return NULL;
  415 }
 416
 417 /* Support code for asymmetrically connected dgram sockets
 418  *
 419  * If a datagram socket is connected to a socket not itself connected
 420  * to the first socket (eg, /dev/log), clients may only enqueue more
 421  * messages if the present receive queue of the server socket is not
 422  * "too large". This means there's a second writeability condition
 423  * poll and sendmsg need to test. The dgram recv code will do a wake
 424  * up on the peer_wait wait queue of a socket upon reception of a
 425  * datagram which needs to be propagated to sleeping would-be writers
 426  * since these might not have sent anything so far. This can't be
 427  * accomplished via poll_wait because the lifetime of the server
 428  * socket might be less than that of its clients if these break their
 429  * association with it or if the server socket is closed while clients
 430  * are still connected to it and there's no way to inform "a polling
 431  * implementation" that it should let go of a certain wait queue
 432  *
 433  * In order to propagate a wake up, a wait_queue_entry_t of the client
 434  * socket is enqueued on the peer_wait queue of the server socket
 435  * whose wake function does a wake_up on the ordinary client socket
 436  * wait queue. This connection is established whenever a write (or
 437  * poll for write) hit the flow control condition and broken when the
 438  * association to the server socket is dissolved or after a wake up
 439  * was relayed.
 440  */
 441
  442 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
  443                                       void *key)
  444 {
              /* Wake function of the peer_wake entry.  q is embedded in the
               * waiting socket's unix_sock; peer_wake.private holds the sock on
               * whose peer_wait queue the entry sits (it is set in
               * unix_dgram_peer_wake_connect()).  The entry removes itself,
               * clears private and forwards the wake-up, with the same poll
               * key, to the waiting socket's own wait queue.  Always returns 0.
               * NOTE(review): __remove_wait_queue() is used without taking the
               * queue lock here; presumably the waker already holds
               * peer_wait.lock - confirm.
               */
  445         struct unix_sock *u;
  446         wait_queue_head_t *u_sleep;
  447
  448         u = container_of(q, struct unix_sock, peer_wake);
  449
  450         __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
  451                             q);
  452         u->peer_wake.private = NULL;
  453
  454         /* relaying can only happen while the wq still exists */
  455         u_sleep = sk_sleep(&u->sk);
  456         if (u_sleep)
  457                 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
  458
  459         return 0;
  460 }
 461
  462 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
  463 {
              /* Enqueue sk's peer_wake entry on other's peer_wait queue, unless
               * the entry is already in use (peer_wake.private non-NULL).  The
               * check and the enqueue happen under other's peer_wait.lock.
               * Returns 1 if the entry was enqueued by this call, 0 otherwise.
               */
  464         struct unix_sock *u, *u_other;
  465         int rc;
  466
  467         u = unix_sk(sk);
  468         u_other = unix_sk(other);
  469         rc = 0;
  470         spin_lock(&u_other->peer_wait.lock);
  471
  472         if (!u->peer_wake.private) {
  473                 u->peer_wake.private = other;
  474                 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
  475
  476                 rc = 1;
  477         }
  478
  479         spin_unlock(&u_other->peer_wait.lock);
  480         return rc;
  481 }
 482
  483 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
  484                                             struct sock *other)
  485 {
              /* Remove sk's peer_wake entry from other's peer_wait queue, but
               * only if it is currently registered against 'other'
               * (peer_wake.private == other).  Done under other's
               * peer_wait.lock; otherwise a no-op.
               */
  486         struct unix_sock *u, *u_other;
  487
  488         u = unix_sk(sk);
  489         u_other = unix_sk(other);
  490         spin_lock(&u_other->peer_wait.lock);
  491
  492         if (u->peer_wake.private == other) {
  493                 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
  494                 u->peer_wake.private = NULL;
  495         }
  496
  497         spin_unlock(&u_other->peer_wait.lock);
  498 }
 499
  500 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
  501                                                    struct sock *other)
  502 {
              /* Break the wake-up relay from 'other' and then wake sk's own
               * sleepers with a writability event (EPOLLOUT | EPOLLWRNORM |
               * EPOLLWRBAND).
               */
  503         unix_dgram_peer_wake_disconnect(sk, other);
  504         wake_up_interruptible_poll(sk_sleep(sk),
  505                                    EPOLLOUT |
  506                                    EPOLLWRNORM |
  507                                    EPOLLWRBAND);
  508 }
 509
  510 /* preconditions:
  511  *      - unix_peer(sk) == other
  512  *      - association is stable
  513  */
  514 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
  515 {
              /* Register sk for a relayed wake-up from 'other' and report
               * whether sk has to wait.
               *
               * Returns 1 when other's receive queue is full and other is not
               * SOCK_DEAD; the wake entry then stays queued.  Returns 0
               * otherwise, after removing the entry again if this call was the
               * one that enqueued it.
               */
  516         int connected;
  517
  518         connected = unix_dgram_peer_wake_connect(sk, other);
  519
  520         /* If other is SOCK_DEAD, we want to make sure we signal
  521          * POLLOUT, such that a subsequent write() can get a
  522          * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
  523          * to other and its full, we will hang waiting for POLLOUT.
  524          */
  525         if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
  526                 return 1;
  527
  528         if (connected)
  529                 unix_dgram_peer_wake_disconnect(sk, other);
  530
  531         return 0;
  532 }
 533
 534 static int unix_writable(const struct sock *sk)
 535 {
 536         return sk->sk_state != TCP_LISTEN &&
 537                (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 538 }
 539
  540 static void unix_write_space(struct sock *sk)
  541 {
              /* If sk is writable (see unix_writable()), wake sleepers on its
               * wait queue with a writability poll event and send the async
               * SOCK_WAKE_SPACE / POLL_OUT notification.  sk->sk_wq is
               * dereferenced inside an RCU read-side section.
               */
  542         struct socket_wq *wq;
  543
  544         rcu_read_lock();
  545         if (unix_writable(sk)) {
  546                 wq = rcu_dereference(sk->sk_wq);
  547                 if (skwq_has_sleeper(wq))
  548                         wake_up_interruptible_sync_poll(&wq->wait,
  549                                 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
  550                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
  551         }
  552         rcu_read_unlock();
  553 }
 554
  555 /* When dgram socket disconnects (or changes its peer), we clear its receive
  556  * queue of packets arrived from previous peer. First, it allows to do
  557  * flow control based only on wmem_alloc; second, sk connected to peer
  558  * may receive messages only from that peer. */
  559 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
  560 {
              /* sk: the socket dropping its association; other: the former peer.
               * Nothing is purged or signalled when sk's receive queue is
               * already empty.
               */
  561         if (!skb_queue_empty(&sk->sk_receive_queue)) {
  562                 skb_queue_purge(&sk->sk_receive_queue);
                      /* The queue was just emptied: wake everyone on sk's peer_wait. */
  563                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
  564
  565                 /* If one link of bidirectional dgram pipe is disconnected,
  566                  * we signal error. Messages are lost. Do not make this,
  567                  * when peer was not connected to us.
  568                  */
  569                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
  570                         WRITE_ONCE(other->sk_err, ECONNRESET);
  571                         sk_error_report(other);
  572                 }
  573         }
              /* The former peer's state is set to TCP_CLOSE on every call. */
  574         other->sk_state = TCP_CLOSE;
  575 }
 576
  577 static void unix_sock_destructor(struct sock *sk)
  578 {
              /* Final teardown of a unix socket: purge whatever is left on the
               * receive queue, sanity-check (debug builds) that no write memory
               * is outstanding, that the socket is unhashed and detached from
               * its struct socket, then release the bound address and update
               * the global and per-protocol counters.
               */
  579         struct unix_sock *u = unix_sk(sk);
  580
  581         skb_queue_purge(&sk->sk_receive_queue);
  582
  583         DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
  584         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
  585         DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
              /* A socket not marked SOCK_DEAD is only reported; the address
               * release and counter updates below are skipped for it.
               */
  586         if (!sock_flag(sk, SOCK_DEAD)) {
  587                 pr_info("Attempt to release alive unix socket: %p\n", sk);
  588                 return;
  589         }
  590
  591         if (u->addr)
  592                 unix_release_addr(u->addr);
  593
  594         atomic_long_dec(&unix_nr_socks);
  595         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
  596 #ifdef UNIX_REFCNT_DEBUG
  597         pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
  598                 atomic_long_read(&unix_nr_socks));
  599 #endif
  600 }
 601
  602 static void unix_release_sock(struct sock *sk, int embrion)
  603 {
              /* Tear down sk.
               *
               * @sk:      socket being released.
               * @embrion: non-zero when sk is released on behalf of a listening
               *           socket that is itself being released (see the
               *           recursive call below); it forces ECONNRESET on sk's
               *           peer.
               *
               * Order of operations: unhash, clear state under the state lock,
               * notify the peer, drain the receive queue, drop the path and
               * the socket reference, then possibly run the fd garbage
               * collector.
               */
  604         struct unix_sock *u = unix_sk(sk);
  605         struct sock *skpair;
  606         struct sk_buff *skb;
  607         struct path path;
  608         int state;
  609
  610         unix_remove_socket(sock_net(sk), sk);
  611         unix_remove_bsd_socket(sk);
  612
  613         /* Clear state */
  614         unix_state_lock(sk);
  615         sock_orphan(sk);
  616         WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
              /* Take over the path so it can be put after the lock is dropped. */
  617         path         = u->path;
  618         u->path.dentry = NULL;
  619         u->path.mnt = NULL;
              /* Remember the previous state: it decides how queued skbs are handled. */
  620         state = sk->sk_state;
  621         sk->sk_state = TCP_CLOSE;
  622
  623         skpair = unix_peer(sk);
  624         unix_peer(sk) = NULL;
  625
  626         unix_state_unlock(sk);
  627
  628 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
  629         if (u->oob_skb) {
  630                 kfree_skb(u->oob_skb);
  631                 u->oob_skb = NULL;
  632         }
  633 #endif
  634
  635         wake_up_interruptible_all(&u->peer_wait);
  636
  637         if (skpair != NULL) {
  638                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
  639                         unix_state_lock(skpair);
  640                         /* No more writes */
  641                         WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
                              /* Unread data on our side, or an embryo: the peer sees a reset. */
  642                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
  643                                 WRITE_ONCE(skpair->sk_err, ECONNRESET);
  644                         unix_state_unlock(skpair);
  645                         skpair->sk_state_change(skpair);
  646                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
  647                 }
  648
  649                 unix_dgram_peer_wake_disconnect(sk, skpair);
  650                 sock_put(skpair); /* It may now die */
  651         }
  652
  653         /* Try to flush out this socket. Throw out buffers at least */
  654
  655         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
                      /* For a socket that was listening, the sock attached to each
                       * queued skb is released as well, with embrion set.
                       */
  656                 if (state == TCP_LISTEN)
  657                         unix_release_sock(skb->sk, 1);
  658                 /* passed fds are erased in the kfree_skb hook        */
  659                 UNIXCB(skb).consumed = skb->len;
  660                 kfree_skb(skb);
  661         }
  662
  663         if (path.dentry)
  664                 path_put(&path);
  665
  666         sock_put(sk);
  667
  668         /* ---- Socket is dead now and most probably destroyed ---- */
  669
  670         /*
  671          * Fixme: BSD difference: In BSD all sockets connected to us get
  672          *        ECONNRESET and we die on the spot. In Linux we behave
  673          *        like files and pipes do and wait for the last
  674          *        dereference.
  675          *
  676          * Can't we simply set sock->err?
  677          *
  678          *        What the above comment does talk about? --ANK(980817)
  679          */
  680
  681         if (READ_ONCE(unix_tot_inflight))
  682                 unix_gc();              /* Garbage collect fds */
  683 }
 684
  685 static void init_peercred(struct sock *sk)
  686 {
              /* Set sk's peer pid/cred to those of the current task (thread-group
               * pid and current credentials), each with a new reference.  The
               * swap is done under sk_peer_lock; the previous pid and cred
               * references are dropped after the lock is released.
               */
  687         const struct cred *old_cred;
  688         struct pid *old_pid;
  689
  690         spin_lock(&sk->sk_peer_lock);
  691         old_pid = sk->sk_peer_pid;
  692         old_cred = sk->sk_peer_cred;
  693         sk->sk_peer_pid  = get_pid(task_tgid(current));
  694         sk->sk_peer_cred = get_current_cred();
  695         spin_unlock(&sk->sk_peer_lock);
  696
  697         put_pid(old_pid);
  698         put_cred(old_cred);
  699 }
 700
  701 static void copy_peercred(struct sock *sk, struct sock *peersk)
  702 {
              /* Copy peersk's peer pid/cred into sk, taking new references.
               * Both sk_peer_lock spinlocks are held during the copy; they are
               * always acquired in ascending address order of the two socks so
               * that concurrent copies in opposite directions cannot deadlock.
               * sk's previous pid and cred references are dropped after
               * unlocking.
               */
  703         const struct cred *old_cred;
  704         struct pid *old_pid;
  705
  706         if (sk < peersk) {
  707                 spin_lock(&sk->sk_peer_lock);
  708                 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
  709         } else {
  710                 spin_lock(&peersk->sk_peer_lock);
  711                 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
  712         }
  713         old_pid = sk->sk_peer_pid;
  714         old_cred = sk->sk_peer_cred;
  715         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
  716         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
  717
  718         spin_unlock(&sk->sk_peer_lock);
  719         spin_unlock(&peersk->sk_peer_lock);
  720
  721         put_pid(old_pid);
  722         put_cred(old_cred);
  723 }
 724
 725 static int unix_listen(struct socket *sock, int backlog)
 726 {
 727         int err;
 728         struct sock *sk = sock->sk;
 729         struct unix_sock *u = unix_sk(sk);
 730
 731         err = -EOPNOTSUPP;
 732         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 733                 goto out;       /* Only stream/seqpacket sockets accept */
 734         err = -EINVAL;
 735         if (!u->addr)
 736                 goto out;       /* No listens on an unbound socket */
 737         unix_state_lock(sk);
 738         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 739                 goto out_unlock;
 740         if (backlog > sk->sk_max_ack_backlog)
 741                 wake_up_interruptible_all(&u->peer_wait);
 742         sk->sk_max_ack_backlog  = backlog;
 743         sk->sk_state            = TCP_LISTEN;
 744         /* set credentials so connect can copy them */
 745         init_peercred(sk);
 746         err = 0;
 747
 748 out_unlock:
 749         unix_state_unlock(sk);
 750 out:
 751         return err;
 752 }
 753
  754 static int unix_release(struct socket *); /* forward declarations: referenced by the proto_ops tables before their definitions */
  755 static int unix_bind(struct socket *, struct sockaddr *, int);
  756 static int unix_stream_connect(struct socket *, struct sockaddr *,
  757                                int addr_len, int flags);
  758 static int unix_socketpair(struct socket *, struct socket *);
  759 static int unix_accept(struct socket *, struct socket *, int, bool);
  760 static int unix_getname(struct socket *, struct sockaddr *, int);
  761 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
  762 static __poll_t unix_dgram_poll(struct file *, struct socket *,
  763                                     poll_table *);
  764 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
  765 #ifdef CONFIG_COMPAT
  766 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
  767 #endif
  768 static int unix_shutdown(struct socket *, int);
  769 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
  770 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
  771 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
  772                                        struct pipe_inode_info *, size_t size,
  773                                        unsigned int flags);
  774 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
  775 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
  776 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
  777 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
  778 static int unix_dgram_connect(struct socket *, struct sockaddr *,
  779                               int, int);
  780 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
  781 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
  782                                   int);
 783
  784 static int unix_set_peek_off(struct sock *sk, int val)
  785 {
              /* Store a new sk_peek_off while holding the socket's iolock mutex.
               * Returns 0 on success, or -EINTR when the wait for the mutex was
               * interrupted (the offset is left unchanged in that case).
               */
  786         struct unix_sock *u = unix_sk(sk);
  787
  788         if (mutex_lock_interruptible(&u->iolock))
  789                 return -EINTR;
  790
  791         WRITE_ONCE(sk->sk_peek_off, val);
  792         mutex_unlock(&u->iolock);
  793
  794         return 0;
  795 }
 796
 797 #ifdef CONFIG_PROC_FS
 798 static int unix_count_nr_fds(struct sock *sk)
 799 {
 800         struct sk_buff *skb;
 801         struct unix_sock *u;
 802         int nr_fds = 0;
 803
 804         spin_lock(&sk->sk_receive_queue.lock);
 805         skb = skb_peek(&sk->sk_receive_queue);
 806         while (skb) {
 807                 u = unix_sk(skb->sk);
 808                 nr_fds += atomic_read(&u->scm_stat.nr_fds);
 809                 skb = skb_peek_next(skb, &sk->sk_receive_queue);
 810         }
 811         spin_unlock(&sk->sk_receive_queue.lock);
 812
 813         return nr_fds;
 814 }
 815
 816 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
 817 {
 818         struct sock *sk = sock->sk;
 819         unsigned char s_state;
 820         struct unix_sock *u;
 821         int nr_fds = 0;
 822
 823         if (sk) {
 824                 s_state = READ_ONCE(sk->sk_state);
 825                 u = unix_sk(sk);
 826
 827                 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
 828                  * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
 829                  * SOCK_DGRAM is ordinary. So, no lock is needed.
 830                  */
 831                 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
 832                         nr_fds = atomic_read(&u->scm_stat.nr_fds);
 833                 else if (s_state == TCP_LISTEN)
 834                         nr_fds = unix_count_nr_fds(sk);
 835
 836                 seq_printf(m, "scm_fds: %u\n", nr_fds);
 837         }
 838 }
 839 #else
 840 #define unix_show_fdinfo NULL
 841 #endif
 842
 843 static const struct proto_ops unix_stream_ops = {
 844         .family =       PF_UNIX,
 845         .owner =        THIS_MODULE,
 846         .release =      unix_release,
 847         .bind =         unix_bind,
 848         .connect =      unix_stream_connect,
 849         .socketpair =   unix_socketpair,
 850         .accept =       unix_accept,
 851         .getname =      unix_getname,
 852         .poll =         unix_poll,
 853         .ioctl =        unix_ioctl,
 854 #ifdef CONFIG_COMPAT
 855         .compat_ioctl = unix_compat_ioctl,
 856 #endif
 857         .listen =       unix_listen,
 858         .shutdown =     unix_shutdown,
 859         .sendmsg =      unix_stream_sendmsg,
 860         .recvmsg =      unix_stream_recvmsg,
 861         .read_skb =     unix_stream_read_skb,
 862         .mmap =         sock_no_mmap,
 863         .splice_read =  unix_stream_splice_read,
 864         .set_peek_off = unix_set_peek_off,
 865         .show_fdinfo =  unix_show_fdinfo,
 866 };
 867
 868 static const struct proto_ops unix_dgram_ops = {
 869         .family =       PF_UNIX,
 870         .owner =        THIS_MODULE,
 871         .release =      unix_release,
 872         .bind =         unix_bind,
 873         .connect =      unix_dgram_connect,
 874         .socketpair =   unix_socketpair,
 875         .accept =       sock_no_accept,
 876         .getname =      unix_getname,
 877         .poll =         unix_dgram_poll,
 878         .ioctl =        unix_ioctl,
 879 #ifdef CONFIG_COMPAT
 880         .compat_ioctl = unix_compat_ioctl,
 881 #endif
 882         .listen =       sock_no_listen,
 883         .shutdown =     unix_shutdown,
 884         .sendmsg =      unix_dgram_sendmsg,
 885         .read_skb =     unix_read_skb,
 886         .recvmsg =      unix_dgram_recvmsg,
 887         .mmap =         sock_no_mmap,
 888         .set_peek_off = unix_set_peek_off,
 889         .show_fdinfo =  unix_show_fdinfo,
 890 };
 891
 892 static const struct proto_ops unix_seqpacket_ops = {
 893         .family =       PF_UNIX,
 894         .owner =        THIS_MODULE,
 895         .release =      unix_release,
 896         .bind =         unix_bind,
 897         .connect =      unix_stream_connect,
 898         .socketpair =   unix_socketpair,
 899         .accept =       unix_accept,
 900         .getname =      unix_getname,
 901         .poll =         unix_dgram_poll,
 902         .ioctl =        unix_ioctl,
 903 #ifdef CONFIG_COMPAT
 904         .compat_ioctl = unix_compat_ioctl,
 905 #endif
 906         .listen =       unix_listen,
 907         .shutdown =     unix_shutdown,
 908         .sendmsg =      unix_seqpacket_sendmsg,
 909         .recvmsg =      unix_seqpacket_recvmsg,
 910         .mmap =         sock_no_mmap,
 911         .set_peek_off = unix_set_peek_off,
 912         .show_fdinfo =  unix_show_fdinfo,
 913 };
 914
 915 static void unix_close(struct sock *sk, long timeout)
 916 {
 917         /* Nothing to do here, unix socket does not need a ->close().
 918          * This is merely for sockmap.
 919          */
 920 }
 921
 922 static void unix_unhash(struct sock *sk)
 923 {
 924         /* Nothing to do here, unix socket does not need a ->unhash().
 925          * This is merely for sockmap.
 926          */
 927 }
 928
 929 static bool unix_bpf_bypass_getsockopt(int level, int optname)
 930 {
 931         if (level == SOL_SOCKET) {
 932                 switch (optname) {
 933                 case SO_PEERPIDFD:
 934                         return true;
 935                 default:
 936                         return false;
 937                 }
 938         }
 939
 940         return false;
 941 }
 942
 943 struct proto unix_dgram_proto = {
 944         .name                   = "UNIX",
 945         .owner                  = THIS_MODULE,
 946         .obj_size               = sizeof(struct unix_sock),
 947         .close                  = unix_close,
 948         .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
 949 #ifdef CONFIG_BPF_SYSCALL
 950         .psock_update_sk_prot   = unix_dgram_bpf_update_proto,
 951 #endif
 952 };
 953
 954 struct proto unix_stream_proto = {
 955         .name                   = "UNIX-STREAM",
 956         .owner                  = THIS_MODULE,
 957         .obj_size               = sizeof(struct unix_sock),
 958         .close                  = unix_close,
 959         .unhash                 = unix_unhash,
 960         .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
 961 #ifdef CONFIG_BPF_SYSCALL
 962         .psock_update_sk_prot   = unix_stream_bpf_update_proto,
 963 #endif
 964 };
 965
 966 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
 967 {
 968         struct unix_sock *u;
 969         struct sock *sk;
 970         int err;
 971
 972         atomic_long_inc(&unix_nr_socks);
 973         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
 974                 err = -ENFILE;
 975                 goto err;
 976         }
 977
 978         if (type == SOCK_STREAM)
 979                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
 980         else /*dgram and  seqpacket */
 981                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
 982
 983         if (!sk) {
 984                 err = -ENOMEM;
 985                 goto err;
 986         }
 987
 988         sock_init_data(sock, sk);
 989
 990         sk->sk_hash             = unix_unbound_hash(sk);
 991         sk->sk_allocation       = GFP_KERNEL_ACCOUNT;
 992         sk->sk_write_space      = unix_write_space;
 993         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 994         sk->sk_destruct         = unix_sock_destructor;
 995         u         = unix_sk(sk);
 996         u->path.dentry = NULL;
 997         u->path.mnt = NULL;
 998         spin_lock_init(&u->lock);
 999         atomic_long_set(&u->inflight, 0);
1000         INIT_LIST_HEAD(&u->link);
1001         mutex_init(&u->iolock); /* single task reading lock */
1002         mutex_init(&u->bindlock); /* single task binding lock */
1003         init_waitqueue_head(&u->peer_wait);
1004         init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1005         memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1006         unix_insert_unbound_socket(net, sk);
1007
1008         sock_prot_inuse_add(net, sk->sk_prot, 1);
1009
1010         return sk;
1011
1012 err:
1013         atomic_long_dec(&unix_nr_socks);
1014         return ERR_PTR(err);
1015 }
1016
1017 static int unix_create(struct net *net, struct socket *sock, int protocol,
1018                        int kern)
1019 {
1020         struct sock *sk;
1021
1022         if (protocol && protocol != PF_UNIX)
1023                 return -EPROTONOSUPPORT;
1024
1025         sock->state = SS_UNCONNECTED;
1026
1027         switch (sock->type) {
1028         case SOCK_STREAM:
1029                 sock->ops = &unix_stream_ops;
1030                 break;
1031                 /*
1032                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
1033                  *      nothing uses it.
1034                  */
1035         case SOCK_RAW:
1036                 sock->type = SOCK_DGRAM;
1037                 fallthrough;
1038         case SOCK_DGRAM:
1039                 sock->ops = &unix_dgram_ops;
1040                 break;
1041         case SOCK_SEQPACKET:
1042                 sock->ops = &unix_seqpacket_ops;
1043                 break;
1044         default:
1045                 return -ESOCKTNOSUPPORT;
1046         }
1047
1048         sk = unix_create1(net, sock, kern, sock->type);
1049         if (IS_ERR(sk))
1050                 return PTR_ERR(sk);
1051
1052         return 0;
1053 }
1054
1055 static int unix_release(struct socket *sock)
1056 {
1057         struct sock *sk = sock->sk;
1058
1059         if (!sk)
1060                 return 0;
1061
1062         sk->sk_prot->close(sk, 0);
1063         unix_release_sock(sk, 0);
1064         sock->sk = NULL;
1065
1066         return 0;
1067 }
1068
/*
 * Look up the socket bound to the filesystem path in @sunaddr.
 *
 * The path is resolved following symlinks and must be writable by the
 * caller.  Returns the sock (as handed back by unix_find_socket_byinode())
 * after touching the path's atime, or an ERR_PTR():
 *   - the error from kern_path() / path_permission(),
 *   - -ECONNREFUSED if the path is not a socket inode or no socket is
 *     bound to it,
 *   - -EPROTOTYPE if the bound socket's type differs from @type.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
                                  int type)
{
        struct inode *inode;
        struct sock *found;
        struct path path;
        int err;

        unix_mkname_bsd(sunaddr, addr_len);

        err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
        if (err)
                return ERR_PTR(err);

        err = path_permission(&path, MAY_WRITE);
        if (err)
                goto out_path;

        inode = d_backing_inode(path.dentry);
        if (!S_ISSOCK(inode->i_mode)) {
                err = -ECONNREFUSED;
                goto out_path;
        }

        found = unix_find_socket_byinode(inode);
        if (!found) {
                err = -ECONNREFUSED;
                goto out_path;
        }

        if (found->sk_type != type) {
                err = -EPROTOTYPE;
                sock_put(found);
                goto out_path;
        }

        touch_atime(&path);
        path_put(&path);

        return found;

out_path:
        path_put(&path);
        return ERR_PTR(err);
}
1112
/*
 * Look up the socket bound to the abstract address in @sunaddr within
 * @net.  Returns the sock found by unix_find_socket_byname(), or
 * ERR_PTR(-ECONNREFUSED) when there is none.  If the sock found has a
 * path dentry, its atime is touched.
 */
static struct sock *unix_find_abstract(struct net *net,
                                       struct sockaddr_un *sunaddr,
                                       int addr_len, int type)
{
        unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
        struct sock *found;

        found = unix_find_socket_byname(net, sunaddr, addr_len, hash);
        if (!found)
                return ERR_PTR(-ECONNREFUSED);

        if (unix_sk(found)->path.dentry)
                touch_atime(&unix_sk(found)->path);

        return found;
}
1131
1132 static struct sock *unix_find_other(struct net *net,
1133                                     struct sockaddr_un *sunaddr,
1134                                     int addr_len, int type)
1135 {
1136         struct sock *sk;
1137
1138         if (sunaddr->sun_path[0])
1139                 sk = unix_find_bsd(sunaddr, addr_len, type);
1140         else
1141                 sk = unix_find_abstract(net, sunaddr, addr_len, type);
1142
1143         return sk;
1144 }
1145
1146 static int unix_autobind(struct sock *sk)
1147 {
1148         unsigned int new_hash, old_hash = sk->sk_hash;
1149         struct unix_sock *u = unix_sk(sk);
1150         struct net *net = sock_net(sk);
1151         struct unix_address *addr;
1152         u32 lastnum, ordernum;
1153         int err;
1154
1155         err = mutex_lock_interruptible(&u->bindlock);
1156         if (err)
1157                 return err;
1158
1159         if (u->addr)
1160                 goto out;
1161
1162         err = -ENOMEM;
1163         addr = kzalloc(sizeof(*addr) +
1164                        offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1165         if (!addr)
1166                 goto out;
1167
1168         addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1169         addr->name->sun_family = AF_UNIX;
1170         refcount_set(&addr->refcnt, 1);
1171
1172         ordernum = get_random_u32();
1173         lastnum = ordernum & 0xFFFFF;
1174 retry:
1175         ordernum = (ordernum + 1) & 0xFFFFF;
1176         sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1177
1178         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1179         unix_table_double_lock(net, old_hash, new_hash);
1180
1181         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1182                 unix_table_double_unlock(net, old_hash, new_hash);
1183
1184                 /* __unix_find_socket_byname() may take long time if many names
1185                  * are already in use.
1186                  */
1187                 cond_resched();
1188
1189                 if (ordernum == lastnum) {
1190                         /* Give up if all names seems to be in use. */
1191                         err = -ENOSPC;
1192                         unix_release_addr(addr);
1193                         goto out;
1194                 }
1195
1196                 goto retry;
1197         }
1198
1199         __unix_set_addr_hash(net, sk, addr, new_hash);
1200         unix_table_double_unlock(net, old_hash, new_hash);
1201         err = 0;
1202
1203 out:    mutex_unlock(&u->bindlock);
1204         return err;
1205 }
1206
/*
 * Bind @sk to a filesystem (pathname) address.
 *
 * A socket node is created at the path named in @sunaddr with vfs_mknod().
 * Then, under u->bindlock, the node's mount and dentry are recorded in
 * u->path and the socket is rehashed using a hash of the backing inode.
 *
 * Returns 0 on success.  On failure a negative errno is returned: -ENOMEM
 * if the address cannot be allocated, -EINVAL if the socket already has an
 * address (the node created above is unlinked again in that case), and
 * -EADDRINUSE in place of any -EEXIST reported along the way.
 */
static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
                         int addr_len)
{
        /* Node mode: a socket, with the socket inode's mode bits filtered
         * through the caller's umask.
         */
        umode_t mode = S_IFSOCK |
               (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
        unsigned int new_hash, old_hash = sk->sk_hash;
        struct unix_sock *u = unix_sk(sk);
        struct net *net = sock_net(sk);
        struct mnt_idmap *idmap;
        struct unix_address *addr;
        struct dentry *dentry;
        struct path parent;
        int err;

        addr_len = unix_mkname_bsd(sunaddr, addr_len);
        addr = unix_create_addr(sunaddr, addr_len);
        if (!addr)
                return -ENOMEM;

        /*
         * Get the parent directory, calculate the hash for last
         * component.
         */
        dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
        if (IS_ERR(dentry)) {
                err = PTR_ERR(dentry);
                goto out;
        }

        /*
         * All right, let's create it.
         */
        idmap = mnt_idmap(parent.mnt);
        err = security_path_mknod(&parent, dentry, mode, 0);
        if (!err)
                err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
        if (err)
                goto out_path;
        err = mutex_lock_interruptible(&u->bindlock);
        if (err)
                goto out_unlink;
        /* Already bound: undo the mknod and report -EINVAL (see out_unlock). */
        if (u->addr)
                goto out_unlock;

        new_hash = unix_bsd_hash(d_backing_inode(dentry));
        unix_table_double_lock(net, old_hash, new_hash);
        /* The socket keeps its own references on the mount and dentry. */
        u->path.mnt = mntget(parent.mnt);
        u->path.dentry = dget(dentry);
        __unix_set_addr_hash(net, sk, addr, new_hash);
        unix_table_double_unlock(net, old_hash, new_hash);
        unix_insert_bsd_socket(sk);
        mutex_unlock(&u->bindlock);
        done_path_create(&parent, dentry);
        return 0;

out_unlock:
        mutex_unlock(&u->bindlock);
        err = -EINVAL;
out_unlink:
        /* failed after successful mknod?  unlink what we'd created... */
        vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
        done_path_create(&parent, dentry);
out:
        unix_release_addr(addr);
        /* An existing path means the address is in use. */
        return err == -EEXIST ? -EADDRINUSE : err;
}
1274
1275 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1276                               int addr_len)
1277 {
1278         unsigned int new_hash, old_hash = sk->sk_hash;
1279         struct unix_sock *u = unix_sk(sk);
1280         struct net *net = sock_net(sk);
1281         struct unix_address *addr;
1282         int err;
1283
1284         addr = unix_create_addr(sunaddr, addr_len);
1285         if (!addr)
1286                 return -ENOMEM;
1287
1288         err = mutex_lock_interruptible(&u->bindlock);
1289         if (err)
1290                 goto out;
1291
1292         if (u->addr) {
1293                 err = -EINVAL;
1294                 goto out_mutex;
1295         }
1296
1297         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1298         unix_table_double_lock(net, old_hash, new_hash);
1299
1300         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1301                 goto out_spin;
1302
1303         __unix_set_addr_hash(net, sk, addr, new_hash);
1304         unix_table_double_unlock(net, old_hash, new_hash);
1305         mutex_unlock(&u->bindlock);
1306         return 0;
1307
1308 out_spin:
1309         unix_table_double_unlock(net, old_hash, new_hash);
1310         err = -EADDRINUSE;
1311 out_mutex:
1312         mutex_unlock(&u->bindlock);
1313 out:
1314         unix_release_addr(addr);
1315         return err;
1316 }
1317
1318 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1319 {
1320         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1321         struct sock *sk = sock->sk;
1322         int err;
1323
1324         if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1325             sunaddr->sun_family == AF_UNIX)
1326                 return unix_autobind(sk);
1327
1328         err = unix_validate_addr(sunaddr, addr_len);
1329         if (err)
1330                 return err;
1331
1332         if (sunaddr->sun_path[0])
1333                 err = unix_bind_bsd(sk, sunaddr, addr_len);
1334         else
1335                 err = unix_bind_abstract(sk, sunaddr, addr_len);
1336
1337         return err;
1338 }
1339
1340 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1341 {
1342         if (unlikely(sk1 == sk2) || !sk2) {
1343                 unix_state_lock(sk1);
1344                 return;
1345         }
1346         if (sk1 > sk2)
1347                 swap(sk1, sk2);
1348
1349         unix_state_lock(sk1);
1350         unix_state_lock_nested(sk2, U_LOCK_SECOND);
1351 }
1352
/*
 * Counterpart of unix_state_double_lock(): release @sk1's state lock and,
 * when @sk2 is a distinct non-NULL sock, @sk2's as well.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
        unix_state_unlock(sk1);

        if (sk2 && sk2 != sk1)
                unix_state_unlock(sk2);
}
1362
1363 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1364                               int alen, int flags)
1365 {
1366         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1367         struct sock *sk = sock->sk;
1368         struct sock *other;
1369         int err;
1370
1371         err = -EINVAL;
1372         if (alen < offsetofend(struct sockaddr, sa_family))
1373                 goto out;
1374
1375         if (addr->sa_family != AF_UNSPEC) {
1376                 err = unix_validate_addr(sunaddr, alen);
1377                 if (err)
1378                         goto out;
1379
1380                 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1381                      test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1382                     !unix_sk(sk)->addr) {
1383                         err = unix_autobind(sk);
1384                         if (err)
1385                                 goto out;
1386                 }
1387
1388 restart:
1389                 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1390                 if (IS_ERR(other)) {
1391                         err = PTR_ERR(other);
1392                         goto out;
1393                 }
1394
1395                 unix_state_double_lock(sk, other);
1396
1397                 /* Apparently VFS overslept socket death. Retry. */
1398                 if (sock_flag(other, SOCK_DEAD)) {
1399                         unix_state_double_unlock(sk, other);
1400                         sock_put(other);
1401                         goto restart;
1402                 }
1403
1404                 err = -EPERM;
1405                 if (!unix_may_send(sk, other))
1406                         goto out_unlock;
1407
1408                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1409                 if (err)
1410                         goto out_unlock;
1411
1412                 sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1413         } else {
1414                 /*
1415                  *      1003.1g breaking connected state with AF_UNSPEC
1416                  */
1417                 other = NULL;
1418                 unix_state_double_lock(sk, other);
1419         }
1420
1421         /*
1422          * If it was connected, reconnect.
1423          */
1424         if (unix_peer(sk)) {
1425                 struct sock *old_peer = unix_peer(sk);
1426
1427                 unix_peer(sk) = other;
1428                 if (!other)
1429                         sk->sk_state = TCP_CLOSE;
1430                 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1431
1432                 unix_state_double_unlock(sk, other);
1433
1434                 if (other != old_peer)
1435                         unix_dgram_disconnected(sk, old_peer);
1436                 sock_put(old_peer);
1437         } else {
1438                 unix_peer(sk) = other;
1439                 unix_state_double_unlock(sk, other);
1440         }
1441
1442         return 0;
1443
1444 out_unlock:
1445         unix_state_double_unlock(sk, other);
1446         sock_put(other);
1447 out:
1448         return err;
1449 }
1450
1451 static long unix_wait_for_peer(struct sock *other, long timeo)
1452         __releases(&unix_sk(other)->lock)
1453 {
1454         struct unix_sock *u = unix_sk(other);
1455         int sched;
1456         DEFINE_WAIT(wait);
1457
1458         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1459
1460         sched = !sock_flag(other, SOCK_DEAD) &&
1461                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1462                 unix_recvq_full_lockless(other);
1463
1464         unix_state_unlock(other);
1465
1466         if (sched)
1467                 timeo = schedule_timeout(timeo);
1468
1469         finish_wait(&u->peer_wait, &wait);
1470         return timeo;
1471 }
1472
/*
 * ->connect() for SOCK_STREAM and SOCK_SEQPACKET sockets.
 *
 * A new sock (newsk) that will become the server side of the connection
 * and a one-byte skb carrying it are allocated up front.  The listening
 * socket is then looked up and locked, the caller's own state is checked
 * and locked, both ends are wired together and the skb is queued on the
 * listener's receive queue for unix_accept() to pick up.
 *
 * If the listener's queue is full the caller blocks (subject to the send
 * timeout / O_NONBLOCK) and retries.
 *
 * sk->sk_state is first sampled before sk's state lock is held, hence the
 * READ_ONCE(); the matching store uses WRITE_ONCE().
 *
 * Returns 0 on success or a negative errno: address validation / autobind
 * / lookup errors, -ENOMEM, -ECONNREFUSED if the target is not listening
 * or has shut down receiving, -EAGAIN if the backlog is full and no
 * waiting is allowed, the sock_intr_errno() value when interrupted by a
 * signal, -EISCONN / -EINVAL for a socket in the wrong state, or the error
 * from the security hook.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                               int addr_len, int flags)
{
        struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
        struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
        struct unix_sock *u = unix_sk(sk), *newu, *otheru;
        struct net *net = sock_net(sk);
        struct sk_buff *skb = NULL;
        long timeo;
        int err;
        int st;

        err = unix_validate_addr(sunaddr, addr_len);
        if (err)
                goto out;

        /* Credential passing was requested but the socket has no address
         * yet: give it an autobound one first.
         */
        if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
             test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
                err = unix_autobind(sk);
                if (err)
                        goto out;
        }

        timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

        /* First of all allocate resources.
           If we will make it after state is locked,
           we will have to recheck all again in any case.
         */

        /* create new sock for complete connection */
        newsk = unix_create1(net, NULL, 0, sock->type);
        if (IS_ERR(newsk)) {
                err = PTR_ERR(newsk);
                newsk = NULL;
                goto out;
        }

        err = -ENOMEM;

        /* Allocate skb for sending to listening sock */
        skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
        if (skb == NULL)
                goto out;

restart:
        /*  Find listening sock. */
        other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
        if (IS_ERR(other)) {
                err = PTR_ERR(other);
                other = NULL;
                goto out;
        }

        /* Latch state of peer */
        unix_state_lock(other);

        /* Apparently VFS overslept socket death. Retry. */
        if (sock_flag(other, SOCK_DEAD)) {
                unix_state_unlock(other);
                sock_put(other);
                goto restart;
        }

        err = -ECONNREFUSED;
        if (other->sk_state != TCP_LISTEN)
                goto out_unlock;
        if (other->sk_shutdown & RCV_SHUTDOWN)
                goto out_unlock;

        if (unix_recvq_full(other)) {
                err = -EAGAIN;
                if (!timeo)
                        goto out_unlock;

                /* Drops other's state lock before sleeping. */
                timeo = unix_wait_for_peer(other, timeo);

                err = sock_intr_errno(timeo);
                if (signal_pending(current))
                        goto out;
                sock_put(other);
                goto restart;
        }

        /* Latch our state.

           It is tricky place. We need to grab our state lock and cannot
           drop lock on peer. It is dangerous because deadlock is
           possible. Connect to self case and simultaneous
           attempt to connect are eliminated by checking socket
           state. other is TCP_LISTEN, if sk is TCP_LISTEN we
           check this before attempt to grab lock.

           Well, and we have to recheck the state after socket locked.
         */
        st = READ_ONCE(sk->sk_state);

        switch (st) {
        case TCP_CLOSE:
                /* This is ok... continue with connect */
                break;
        case TCP_ESTABLISHED:
                /* Socket is already connected */
                err = -EISCONN;
                goto out_unlock;
        default:
                err = -EINVAL;
                goto out_unlock;
        }

        unix_state_lock_nested(sk, U_LOCK_SECOND);

        /* The state changed between the unlocked sample and taking the
         * lock: start over.
         */
        if (sk->sk_state != st) {
                unix_state_unlock(sk);
                unix_state_unlock(other);
                sock_put(other);
                goto restart;
        }

        err = security_unix_stream_connect(sk, other, newsk);
        if (err) {
                unix_state_unlock(sk);
                goto out_unlock;
        }

        /* The way is open! Quickly set all the necessary fields... */

        sock_hold(sk);
        unix_peer(newsk)        = sk;
        newsk->sk_state         = TCP_ESTABLISHED;
        newsk->sk_type          = sk->sk_type;
        init_peercred(newsk);
        newu = unix_sk(newsk);
        RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
        otheru = unix_sk(other);

        /* copy address information from listening to new sock
         *
         * The contents of *(otheru->addr) and otheru->path
         * are seen fully set up here, since we have found
         * otheru in hash under its lock.  Insertion into the
         * hash chain we'd found it in had been done in an
         * earlier critical area protected by the chain's lock,
         * the same one where we'd set *(otheru->addr) contents,
         * as well as otheru->path and otheru->addr itself.
         *
         * Using smp_store_release() here to set newu->addr
         * is enough to make those stores, as well as stores
         * to newu->path visible to anyone who gets newu->addr
         * by smp_load_acquire().  IOW, the same warranties
         * as for unix_sock instances bound in unix_bind() or
         * in unix_autobind().
         */
        if (otheru->path.dentry) {
                path_get(&otheru->path);
                newu->path = otheru->path;
        }
        refcount_inc(&otheru->addr->refcnt);
        smp_store_release(&newu->addr, otheru->addr);

        /* Set credentials */
        copy_peercred(sk, other);

        sock->state     = SS_CONNECTED;
        WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
        sock_hold(newsk);

        smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
        unix_peer(sk)   = newsk;

        unix_state_unlock(sk);

        /* take ten and send info to listening sock */
        spin_lock(&other->sk_receive_queue.lock);
        __skb_queue_tail(&other->sk_receive_queue, skb);
        spin_unlock(&other->sk_receive_queue.lock);
        unix_state_unlock(other);
        other->sk_data_ready(other);
        sock_put(other);
        return 0;

out_unlock:
        if (other)
                unix_state_unlock(other);

out:
        /* Common cleanup: every pointer is either valid or NULL here. */
        kfree_skb(skb);
        if (newsk)
                unix_release_sock(newsk, 0);
        if (other)
                sock_put(other);
        return err;
}
1666
1667 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1668 {
1669         struct sock *ska = socka->sk, *skb = sockb->sk;
1670
1671         /* Join our sockets back to back */
1672         sock_hold(ska);
1673         sock_hold(skb);
1674         unix_peer(ska) = skb;
1675         unix_peer(skb) = ska;
1676         init_peercred(ska);
1677         init_peercred(skb);
1678
1679         ska->sk_state = TCP_ESTABLISHED;
1680         skb->sk_state = TCP_ESTABLISHED;
1681         socka->state  = SS_CONNECTED;
1682         sockb->state  = SS_CONNECTED;
1683         return 0;
1684 }
1685
1686 static void unix_sock_inherit_flags(const struct socket *old,
1687                                     struct socket *new)
1688 {
1689         if (test_bit(SOCK_PASSCRED, &old->flags))
1690                 set_bit(SOCK_PASSCRED, &new->flags);
1691         if (test_bit(SOCK_PASSPIDFD, &old->flags))
1692                 set_bit(SOCK_PASSPIDFD, &new->flags);
1693         if (test_bit(SOCK_PASSSEC, &old->flags))
1694                 set_bit(SOCK_PASSSEC, &new->flags);
1695 }
1696
/*
 * Accept one pending connection on listening socket @sock and attach it to
 * @newsock.
 *
 * Returns 0 on success, -EOPNOTSUPP if @sock is neither SOCK_STREAM nor
 * SOCK_SEQPACKET, -EINVAL if @sock is not in TCP_LISTEN or if no skb was
 * returned without an error being reported, and otherwise the error set by
 * skb_recv_datagram().  O_NONBLOCK in @flags makes the dequeue non-blocking.
 * @kern is not used in this function.
 */
1697 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1698                        bool kern)
1699 {
1700         struct sock *sk = sock->sk;
1701         struct sock *tsk;
1702         struct sk_buff *skb;
1703         int err;
1704
1705         err = -EOPNOTSUPP;
1706         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1707                 goto out;
1708
1709         err = -EINVAL;
1710         if (sk->sk_state != TCP_LISTEN)
1711                 goto out;
1712
1713         /* If socket state is TCP_LISTEN it cannot change (for now...),
1714          * so that no locks are necessary.
1715          */
1716
1717         skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1718                                 &err);
1719         if (!skb) {
1720                 /* This means receive shutdown. */
1721                 if (err == 0)
1722                         err = -EINVAL;
1723                 goto out;
1724         }
1725
             /* The sock to accept is taken from skb->sk; the skb itself is
              * only a carrier and is freed right away.  Presumably it was
              * queued by the connecting side - confirm against the connect
              * path.
              */
1726         tsk = skb->sk;
1727         skb_free_datagram(sk, skb);
             /* A slot on the listener's queue was freed: wake peer_wait sleepers. */
1728         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1729
1730         /* attach accepted sock to socket */
1731         unix_state_lock(tsk);
1732         newsock->state = SS_CONNECTED;
1733         unix_sock_inherit_flags(sock, newsock);
1734         sock_graft(tsk, newsock);
1735         unix_state_unlock(tsk);
1736         return 0;
1737
1738 out:
1739         return err;
1740 }
1741
1742
/*
 * Copy the address of @sock (@peer == 0) or of its peer (@peer != 0) into
 * @uaddr.
 *
 * Returns the length of the address written.  For a sock without an address
 * this is offsetof(struct sockaddr_un, sun_path), with sun_family set to
 * AF_UNIX and sun_path[0] set to 0.  Returns -ENOTCONN when the peer address
 * is requested and there is no peer.
 */
1743 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1744 {
1745         struct sock *sk = sock->sk;
1746         struct unix_address *addr;
1747         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1748         int err = 0;
1749
1750         if (peer) {
1751                 sk = unix_peer_get(sk);
1752
1753                 err = -ENOTCONN;
1754                 if (!sk)
1755                         goto out;
1756                 err = 0;
1757         } else {
                     /* Hold sk so both branches can end in the same sock_put()
                      * below (unix_peer_get() presumably returns a held sock -
                      * confirm).
                      */
1758                 sock_hold(sk);
1759         }
1760
1761         addr = smp_load_acquire(&unix_sk(sk)->addr);
1762         if (!addr) {
1763                 sunaddr->sun_family = AF_UNIX;
1764                 sunaddr->sun_path[0] = 0;
1765                 err = offsetof(struct sockaddr_un, sun_path);
1766         } else {
1767                 err = addr->len;
1768                 memcpy(sunaddr, addr->name, addr->len);
1769         }
1770         sock_put(sk);
1771 out:
1772         return err;
1773 }
1774
/*
 * Give a MSG_PEEK receiver its own copy of the fd list attached to @skb:
 * scm->fp is set to the result of scm_fp_dup(), after which unix_gc_lock is
 * taken and released as a barrier against the garbage collector (explained
 * in the comment inside the function).
 */
1775 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1776 {
1777         scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1778
1779         /*
1780          * Garbage collection of unix sockets starts by selecting a set of
1781          * candidate sockets which have reference only from being in flight
1782          * (total_refs == inflight_refs).  This condition is checked once during
1783          * the candidate collection phase, and candidates are marked as such, so
1784          * that non-candidates can later be ignored.  While inflight_refs is
1785          * protected by unix_gc_lock, total_refs (file count) is not, hence this
1786          * is an instantaneous decision.
1787          *
1788          * Once a candidate, however, the socket must not be reinstalled into a
1789          * file descriptor while the garbage collection is in progress.
1790          *
1791          * If the above conditions are met, then the directed graph of
1792          * candidates (*) does not change while unix_gc_lock is held.
1793          *
1794          * Any operation that changes the file count through file descriptors
1795          * (dup, close, sendmsg) does not change the graph since candidates are
1796          * not installed in fds.
1797          *
1798          * Dequeuing a candidate via recvmsg would install it into an fd, but
1799          * that takes unix_gc_lock to decrement the inflight count, so it's
1800          * serialized with garbage collection.
1801          *
1802          * MSG_PEEK is special in that it does not change the inflight count,
1803          * yet does install the socket into an fd.  The following lock/unlock
1804          * pair is to ensure serialization with garbage collection.  It must be
1805          * done between incrementing the file count and installing the file into
1806          * an fd.
1807          *
1808          * If garbage collection starts after the barrier provided by the
1809          * lock/unlock, then it will see the elevated refcount and not mark this
1810          * as a candidate.  If a garbage collection is already in progress
1811          * before the file count was incremented, then the lock/unlock pair will
1812          * ensure that garbage collection is finished before progressing to
1813          * installing the fd.
1814          *
1815          * (*) A -> B where B is on the queue of A or B is on the queue of C
1816          * which is on the queue of listening socket A.
1817          */
1818         spin_lock(&unix_gc_lock);
1819         spin_unlock(&unix_gc_lock);
1820 }
1821
/*
 * Transfer the control information in @scm to the control block of @skb.
 *
 * pid (through get_pid()), uid and gid are copied, security data is attached
 * with unix_get_secdata(), and the fd list is attached with
 * unix_attach_fds() only when @send_fds is true and @scm carries fds.
 * The skb destructor is set to unix_destruct_scm in every case, including
 * when unix_attach_fds() failed.
 *
 * Returns 0, or the value returned by unix_attach_fds().
 */
1822 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1823 {
1824         int err = 0;
1825
1826         UNIXCB(skb).pid  = get_pid(scm->pid);
1827         UNIXCB(skb).uid = scm->creds.uid;
1828         UNIXCB(skb).gid = scm->creds.gid;
1829         UNIXCB(skb).fp = NULL;
1830         unix_get_secdata(scm, skb);
1831         if (scm->fp && send_fds)
1832                 err = unix_attach_fds(scm, skb);
1833
1834         skb->destructor = unix_destruct_scm;
1835         return err;
1836 }
1837
/*
 * Return true when credentials should accompany data sent from @sock to
 * @other: either side has SOCK_PASSCRED or SOCK_PASSPIDFD set, or @other has
 * no struct socket attached (sk_socket == NULL), in which case its flags
 * cannot be inspected.
 */
1838 static bool unix_passcred_enabled(const struct socket *sock,
1839                                   const struct sock *other)
1840 {
1841         return test_bit(SOCK_PASSCRED, &sock->flags) ||
1842                test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1843                !other->sk_socket ||
1844                test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1845                test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1846 }
1847
1848 /*
1849  * Some apps rely on write() giving SCM_CREDENTIALS.
1850  * We include credentials if the source or destination socket
1851  * asserted SOCK_PASSCRED or SOCK_PASSPIDFD (see unix_passcred_enabled()).
      *
      * Nothing is done when the skb already carries a pid.  Otherwise the
      * current task's thread-group pid (with a reference from get_pid()) and
      * its uid/gid are stored in the skb control block.
1852  */
1853 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1854                             const struct sock *other)
1855 {
1856         if (UNIXCB(skb).pid)
1857                 return;
1858         if (unix_passcred_enabled(sock, other)) {
1859                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1860                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1861         }
1862 }
1863
/*
 * Return true when the credentials stored in @skb (pid pointer, uid, gid)
 * and its security data, as compared by unix_secdata_eq(), all match @scm.
 */
1864 static bool unix_skb_scm_eq(struct sk_buff *skb,
1865                             struct scm_cookie *scm)
1866 {
1867         return UNIXCB(skb).pid == scm->pid &&
1868                uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1869                gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1870                unix_secdata_eq(scm, skb);
1871 }
1872
/*
 * Account the fds attached to @skb on @sk: add their count to
 * scm_stat.nr_fds.  Nothing is done for an skb without fds.
 */
1873 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1874 {
1875         struct scm_fp_list *fp = UNIXCB(skb).fp;
1876         struct unix_sock *u = unix_sk(sk);
1877
1878         if (unlikely(fp && fp->count))
1879                 atomic_add(fp->count, &u->scm_stat.nr_fds);
1880 }
1881
/*
 * Counterpart of scm_stat_add(): subtract the number of fds attached to
 * @skb from @sk's scm_stat.nr_fds.  Nothing is done for an skb without fds.
 */
1882 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1883 {
1884         struct scm_fp_list *fp = UNIXCB(skb).fp;
1885         struct unix_sock *u = unix_sk(sk);
1886
1887         if (unlikely(fp && fp->count))
1888                 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1889 }
1890
1891 /*
1892  *      Send AF_UNIX data.
1893  */
1894
/*
 * Send one datagram of @len bytes from @sock.
 *
 * The destination is the address in msg->msg_name when msg->msg_namelen is
 * non-zero, otherwise the connected peer.  The payload is copied into a
 * newly allocated skb which is appended to the destination's receive queue.
 *
 * Returns @len on success, and also when the destination's socket filter
 * rejects the skb (the packet is dropped without telling the sender).
 * Otherwise a negative errno; the values set directly in this function are
 * -EOPNOTSUPP (MSG_OOB), -ENOTCONN (no address and no peer), -EMSGSIZE,
 * -ECONNRESET, -EPERM, -EPIPE, -ECONNREFUSED and -EAGAIN.
 */
1895 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1896                               size_t len)
1897 {
1898         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1899         struct sock *sk = sock->sk, *other = NULL;
1900         struct unix_sock *u = unix_sk(sk);
1901         struct scm_cookie scm;
1902         struct sk_buff *skb;
1903         int data_len = 0;
1904         int sk_locked;
1905         long timeo;
1906         int err;
1907
1908         wait_for_unix_gc();
1909         err = scm_send(sock, msg, &scm, false);
1910         if (err < 0)
1911                 return err;
1912
1913         err = -EOPNOTSUPP;
1914         if (msg->msg_flags&MSG_OOB)
1915                 goto out;
1916
             /* Explicit destination address, or fall back to the connected
              * peer (sunaddr == NULL marks the latter for the code below).
              */
1917         if (msg->msg_namelen) {
1918                 err = unix_validate_addr(sunaddr, msg->msg_namelen);
1919                 if (err)
1920                         goto out;
1921         } else {
1922                 sunaddr = NULL;
1923                 err = -ENOTCONN;
1924                 other = unix_peer_get(sk);
1925                 if (!other)
1926                         goto out;
1927         }
1928
             /* A sender with credential passing enabled but no address yet
              * is autobound before sending.
              */
1929         if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1930              test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1931                 err = unix_autobind(sk);
1932                 if (err)
1933                         goto out;
1934         }
1935
1936         err = -EMSGSIZE;
1937         if (len > sk->sk_sndbuf - 32)
1938                 goto out;
1939
             /* For large datagrams, data_len is the page-aligned part
              * passed as the second size argument of sock_alloc_send_pskb();
              * the remaining len - data_len bytes are the part skb_put()
              * below makes room for.
              */
1940         if (len > SKB_MAX_ALLOC) {
1941                 data_len = min_t(size_t,
1942                                  len - SKB_MAX_ALLOC,
1943                                  MAX_SKB_FRAGS * PAGE_SIZE);
1944                 data_len = PAGE_ALIGN(data_len);
1945
1946                 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1947         }
1948
1949         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1950                                    msg->msg_flags & MSG_DONTWAIT, &err,
1951                                    PAGE_ALLOC_COSTLY_ORDER);
1952         if (skb == NULL)
1953                 goto out;
1954
1955         err = unix_scm_to_skb(&scm, skb, true);
1956         if (err < 0)
1957                 goto out_free;
1958
1959         skb_put(skb, len - data_len);
1960         skb->data_len = data_len;
1961         skb->len = len;
1962         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1963         if (err)
1964                 goto out_free;
1965
1966         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1967
     /* Re-entered whenever the destination has to be looked up again. */
1968 restart:
1969         if (!other) {
1970                 err = -ECONNRESET;
1971                 if (sunaddr == NULL)
1972                         goto out_free;
1973
1974                 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
1975                                         sk->sk_type);
1976                 if (IS_ERR(other)) {
1977                         err = PTR_ERR(other);
1978                         other = NULL;
1979                         goto out_free;
1980                 }
1981         }
1982
1983         if (sk_filter(other, skb) < 0) {
1984                 /* Toss the packet but do not return any error to the sender */
1985                 err = len;
1986                 goto out_free;
1987         }
1988
             /* sk_locked records whether sk's state lock is held in addition
              * to other's.
              */
1989         sk_locked = 0;
1990         unix_state_lock(other);
1991 restart_locked:
1992         err = -EPERM;
1993         if (!unix_may_send(sk, other))
1994                 goto out_unlock;
1995
1996         if (unlikely(sock_flag(other, SOCK_DEAD))) {
1997                 /*
1998                  *      Check with 1003.1g - what should
1999                  *      datagram error
2000                  */
                     /* The destination is dead: drop its lock and our
                      * reference, then decide under sk's lock what to report.
                      */
2001                 unix_state_unlock(other);
2002                 sock_put(other);
2003
2004                 if (!sk_locked)
2005                         unix_state_lock(sk);
2006
2007                 err = 0;
2008                 if (sk->sk_type == SOCK_SEQPACKET) {
2009                         /* We are here only when racing with unix_release_sock()
2010                          * is clearing @other. Never change state to TCP_CLOSE
2011                          * unlike SOCK_DGRAM wants.
2012                          */
2013                         unix_state_unlock(sk);
2014                         err = -EPIPE;
2015                 } else if (unix_peer(sk) == other) {
                             /* The dead sock was our connected peer:
                              * disconnect from it and drop the reference that
                              * the peer pointer held.
                              */
2016                         unix_peer(sk) = NULL;
2017                         unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2018
2019                         sk->sk_state = TCP_CLOSE;
2020                         unix_state_unlock(sk);
2021
2022                         unix_dgram_disconnected(sk, other);
2023                         sock_put(other);
2024                         err = -ECONNREFUSED;
2025                 } else {
2026                         unix_state_unlock(sk);
2027                 }
2028
                     /* err is still 0 only for an unconnected send by address:
                      * look the address up again.
                      */
2029                 other = NULL;
2030                 if (err)
2031                         goto out_free;
2032                 goto restart;
2033         }
2034
2035         err = -EPIPE;
2036         if (other->sk_shutdown & RCV_SHUTDOWN)
2037                 goto out_unlock;
2038
2039         if (sk->sk_type != SOCK_SEQPACKET) {
2040                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2041                 if (err)
2042                         goto out_unlock;
2043         }
2044
2045         /* other == sk && unix_peer(other) != sk if
2046          * - unix_peer(sk) == NULL, destination address bound to sk
2047          * - unix_peer(sk) == sk by time of get but disconnected before lock
2048          */
2049         if (other != sk &&
2050             unlikely(unix_peer(other) != sk &&
2051             unix_recvq_full_lockless(other))) {
2052                 if (timeo) {
                             /* NOTE(review): other's state lock is not released
                              * explicitly on this path; presumably
                              * unix_wait_for_peer() drops it - confirm.
                              */
2053                         timeo = unix_wait_for_peer(other, timeo);
2054
2055                         err = sock_intr_errno(timeo);
2056                         if (signal_pending(current))
2057                                 goto out_free;
2058
2059                         goto restart;
2060                 }
2061
                     /* No time left to wait: take both state locks and
                      * re-check with sk locked as well.
                      */
2062                 if (!sk_locked) {
2063                         unix_state_unlock(other);
2064                         unix_state_double_lock(sk, other);
2065                 }
2066
2067                 if (unix_peer(sk) != other ||
2068                     unix_dgram_peer_wake_me(sk, other)) {
2069                         err = -EAGAIN;
                             /* Both locks are held here, so make out_unlock
                              * release sk's as well.
                              */
2070                         sk_locked = 1;
2071                         goto out_unlock;
2072                 }
2073
                     /* First pass with both locks held: redo every check
                      * from restart_locked.
                      */
2074                 if (!sk_locked) {
2075                         sk_locked = 1;
2076                         goto restart_locked;
2077                 }
2078         }
2079
2080         if (unlikely(sk_locked))
2081                 unix_state_unlock(sk);
2082
             /* Deliver: queue the skb on the destination and notify it. */
2083         if (sock_flag(other, SOCK_RCVTSTAMP))
2084                 __net_timestamp(skb);
2085         maybe_add_creds(skb, sock, other);
2086         scm_stat_add(other, skb);
2087         skb_queue_tail(&other->sk_receive_queue, skb);
2088         unix_state_unlock(other);
2089         other->sk_data_ready(other);
2090         sock_put(other);
2091         scm_destroy(&scm);
2092         return len;
2093
     /* Error exits: each label falls through to the ones below it. */
2094 out_unlock:
2095         if (sk_locked)
2096                 unix_state_unlock(sk);
2097         unix_state_unlock(other);
2098 out_free:
2099         kfree_skb(skb);
2100 out:
2101         if (other)
2102                 sock_put(other);
2103         scm_destroy(&scm);
2104         return err;
2105 }
2106
2107 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2108  * bytes, and a minimum of a full page.
      *
      * The value is PAGE_SIZE << get_order(32768): 32768 bytes expressed as a
      * power-of-two number of pages, which is a single page when PAGE_SIZE
      * is larger than 32768.
2109  */
2110 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2111
2112 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * Queue one out-of-band byte, taken from @msg's iterator, on @other.
 *
 * A one-byte skb is allocated and filled, recorded as @other's oob_skb
 * (replacing and releasing any previous one) and appended to @other's
 * receive queue; sk_send_sigurg() and sk_data_ready() are then called on
 * @other.  fds from @scm are attached only if @fds_sent is false.
 *
 * Returns 0 on success, -EPIPE if @other is dead or has its receive side
 * shut down, or the error from allocation, unix_scm_to_skb() or the copy.
 */
2113 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2114                      struct scm_cookie *scm, bool fds_sent)
2115 {
2116         struct unix_sock *ousk = unix_sk(other);
2117         struct sk_buff *skb;
2118         int err = 0;
2119
2120         skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2121
2122         if (!skb)
2123                 return err;
2124
2125         err = unix_scm_to_skb(scm, skb, !fds_sent);
2126         if (err < 0) {
2127                 kfree_skb(skb);
2128                 return err;
2129         }
2130         skb_put(skb, 1);
2131         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2132
2133         if (err) {
2134                 kfree_skb(skb);
2135                 return err;
2136         }
2137
2138         unix_state_lock(other);
2139
2140         if (sock_flag(other, SOCK_DEAD) ||
2141             (other->sk_shutdown & RCV_SHUTDOWN)) {
2142                 unix_state_unlock(other);
2143                 kfree_skb(skb);
2144                 return -EPIPE;
2145         }
2146
2147         maybe_add_creds(skb, sock, other);
             /* Extra reference for the oob_skb pointer; the receive queue
              * keeps the original one.
              */
2148         skb_get(skb);
2149
             /* A newer OOB byte replaces the previous one: release the
              * reference the pointer held on the old skb.
              */
2150         if (ousk->oob_skb)
2151                 consume_skb(ousk->oob_skb);
2152
2153         WRITE_ONCE(ousk->oob_skb, skb);
2154
2155         scm_stat_add(other, skb);
2156         skb_queue_tail(&other->sk_receive_queue, skb);
2157         sk_send_sigurg(other);
2158         unix_state_unlock(other);
2159         other->sk_data_ready(other);
2160
2161         return err;
2162 }
2163 #endif
2164
/*
 * Send @len bytes on a connected stream socket.
 *
 * The data is split into skbs which are appended one by one to the peer's
 * receive queue.  fds carried in the control message are attached to the
 * first skb only.  With MSG_OOB and CONFIG_AF_UNIX_OOB enabled, the last
 * byte is held back and sent through queue_oob(); MSG_OOB with @len == 0,
 * or without that config option, fails with -EOPNOTSUPP.
 *
 * Returns the number of bytes queued if that is non-zero, even when a
 * later step failed.  Otherwise a negative errno: -EOPNOTSUPP, -EISCONN,
 * -ENOTCONN, -EPIPE, or the error from allocation, unix_scm_to_skb(), the
 * copy/splice or queue_oob().  On the -EPIPE paths SIGPIPE is sent to the
 * current task when nothing was sent and MSG_NOSIGNAL is not set.
 */
2165 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2166                                size_t len)
2167 {
2168         struct sock *sk = sock->sk;
2169         struct sock *other = NULL;
2170         int err, size;
2171         struct sk_buff *skb;
2172         int sent = 0;
2173         struct scm_cookie scm;
2174         bool fds_sent = false;
2175         int data_len;
2176
2177         wait_for_unix_gc();
2178         err = scm_send(sock, msg, &scm, false);
2179         if (err < 0)
2180                 return err;
2181
2182         err = -EOPNOTSUPP;
2183         if (msg->msg_flags & MSG_OOB) {
2184 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
                     /* Reserve the last byte for queue_oob() below. */
2185                 if (len)
2186                         len--;
2187                 else
2188 #endif
2189                         goto out_err;
2190         }
2191
             /* A destination address is not accepted on a stream socket. */
2192         if (msg->msg_namelen) {
2193                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2194                 goto out_err;
2195         } else {
2196                 err = -ENOTCONN;
                     /* The peer pointer is read directly; no reference is
                      * taken or dropped on it in this function.
                      */
2197                 other = unix_peer(sk);
2198                 if (!other)
2199                         goto out_err;
2200         }
2201
2202         if (sk->sk_shutdown & SEND_SHUTDOWN)
2203                 goto pipe_err;
2204
2205         while (sent < len) {
2206                 size = len - sent;
2207
2208                 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
                             /* Zero-sized skb; the payload is attached by
                              * skb_splice_from_iter() below.
                              */
2209                         skb = sock_alloc_send_pskb(sk, 0, 0,
2210                                                    msg->msg_flags & MSG_DONTWAIT,
2211                                                    &err, 0);
2212                 } else {
2213                         /* Keep two messages in the pipe so it schedules better */
2214                         size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2215
2216                         /* allow fallback to order-0 allocations */
2217                         size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2218
                             /* Whatever exceeds SKB_MAX_HEAD(0) is requested
                              * as data_len, page aligned but never more than
                              * size itself.
                              */
2219                         data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2220
2221                         data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2222
2223                         skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2224                                                    msg->msg_flags & MSG_DONTWAIT, &err,
2225                                                    get_order(UNIX_SKB_FRAGS_SZ));
2226                 }
2227                 if (!skb)
2228                         goto out_err;
2229
2230                 /* Only send the fds in the first buffer */
2231                 err = unix_scm_to_skb(&scm, skb, !fds_sent);
2232                 if (err < 0) {
2233                         kfree_skb(skb);
2234                         goto out_err;
2235                 }
2236                 fds_sent = true;
2237
2238                 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2239                         err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2240                                                    sk->sk_allocation);
2241                         if (err < 0) {
2242                                 kfree_skb(skb);
2243                                 goto out_err;
2244                         }
                             /* The splice may take fewer bytes than asked for;
                              * continue with the amount actually attached and
                              * add it to sk_wmem_alloc.
                              */
2245                         size = err;
2246                         refcount_add(size, &sk->sk_wmem_alloc);
2247                 } else {
2248                         skb_put(skb, size - data_len);
2249                         skb->data_len = data_len;
2250                         skb->len = size;
2251                         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2252                         if (err) {
2253                                 kfree_skb(skb);
2254                                 goto out_err;
2255                         }
2256                 }
2257
2258                 unix_state_lock(other);
2259
2260                 if (sock_flag(other, SOCK_DEAD) ||
2261                     (other->sk_shutdown & RCV_SHUTDOWN))
2262                         goto pipe_err_free;
2263
2264                 maybe_add_creds(skb, sock, other);
2265                 scm_stat_add(other, skb);
2266                 skb_queue_tail(&other->sk_receive_queue, skb);
2267                 unix_state_unlock(other);
2268                 other->sk_data_ready(other);
2269                 sent += size;
2270         }
2271
2272 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2273         if (msg->msg_flags & MSG_OOB) {
2274                 err = queue_oob(sock, msg, other, &scm, fds_sent);
2275                 if (err)
2276                         goto out_err;
2277                 sent++;
2278         }
2279 #endif
2280
2281         scm_destroy(&scm);
2282
2283         return sent;
2284
     /* pipe_err_free is entered with other's state lock held and skb not queued. */
2285 pipe_err_free:
2286         unix_state_unlock(other);
2287         kfree_skb(skb);
2288 pipe_err:
2289         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2290                 send_sig(SIGPIPE, current, 0);
2291         err = -EPIPE;
2292 out_err:
2293         scm_destroy(&scm);
             /* A partial send reports the byte count rather than the error. */
2294         return sent ? : err;
2295 }
2296
/*
 * sendmsg for SOCK_SEQPACKET: report the error returned by sock_error() if
 * there is one, require TCP_ESTABLISHED (-ENOTCONN otherwise), discard any
 * destination address by clearing msg_namelen, then send through
 * unix_dgram_sendmsg().
 */
2297 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2298                                   size_t len)
2299 {
2300         int err;
2301         struct sock *sk = sock->sk;
2302
2303         err = sock_error(sk);
2304         if (err)
2305                 return err;
2306
2307         if (sk->sk_state != TCP_ESTABLISHED)
2308                 return -ENOTCONN;
2309
2310         if (msg->msg_namelen)
2311                 msg->msg_namelen = 0;
2312
2313         return unix_dgram_sendmsg(sock, msg, len);
2314 }
2315
/*
 * recvmsg for SOCK_SEQPACKET: -ENOTCONN unless the socket is in
 * TCP_ESTABLISHED, otherwise identical to unix_dgram_recvmsg().
 */
2316 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2317                                   size_t size, int flags)
2318 {
2319         struct sock *sk = sock->sk;
2320
2321         if (sk->sk_state != TCP_ESTABLISHED)
2322                 return -ENOTCONN;
2323
2324         return unix_dgram_recvmsg(sock, msg, size, flags);
2325 }
2326
/*
 * Copy the address of @sk into msg->msg_name and set msg->msg_namelen to
 * its length.  @msg is left untouched when @sk has no address.  The caller
 * must have checked that msg->msg_name is a valid buffer.
 */
2327 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2328 {
2329         struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2330
2331         if (addr) {
2332                 msg->msg_namelen = addr->len;
2333                 memcpy(msg->msg_name, addr->name, addr->len);
2334         }
2335 }
2336
/*
 * Receive one datagram from @sk into @msg, copying at most @size bytes.
 *
 * Returns the number of bytes copied, or, with MSG_TRUNC in @flags, the
 * length of the datagram beyond the peek offset.  Returns 0 (EOF) on a
 * SOCK_SEQPACKET socket when nothing is queued, the dequeue reported
 * -EAGAIN and the receive side is shut down.  Otherwise a negative errno;
 * MSG_OOB is rejected with -EOPNOTSUPP.
 *
 * u->iolock is held from a successful dequeue until the skb is released at
 * out_free.
 */
2337 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2338                          int flags)
2339 {
2340         struct scm_cookie scm;
2341         struct socket *sock = sk->sk_socket;
2342         struct unix_sock *u = unix_sk(sk);
2343         struct sk_buff *skb, *last;
2344         long timeo;
2345         int skip;
2346         int err;
2347
2348         err = -EOPNOTSUPP;
2349         if (flags&MSG_OOB)
2350                 goto out;
2351
2352         timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2353
             /* Try to dequeue under iolock; on -EAGAIN wait (with iolock
              * dropped) for more packets until the timeout runs out.
              */
2354         do {
2355                 mutex_lock(&u->iolock);
2356
2357                 skip = sk_peek_offset(sk, flags);
2358                 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2359                                               &skip, &err, &last);
2360                 if (skb) {
2361                         if (!(flags & MSG_PEEK))
2362                                 scm_stat_del(sk, skb);
                             /* Leave the loop with iolock still held. */
2363                         break;
2364                 }
2365
2366                 mutex_unlock(&u->iolock);
2367
2368                 if (err != -EAGAIN)
2369                         break;
2370         } while (timeo &&
2371                  !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2372                                               &err, &timeo, last));
2373
2374         if (!skb) { /* implies iolock unlocked */
2375                 unix_state_lock(sk);
2376                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2377                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2378                     (sk->sk_shutdown & RCV_SHUTDOWN))
2379                         err = 0;
2380                 unix_state_unlock(sk);
2381                 goto out;
2382         }
2383
             /* An skb was taken: wake anyone sleeping on peer_wait. */
2384         if (wq_has_sleeper(&u->peer_wait))
2385                 wake_up_interruptible_sync_poll(&u->peer_wait,
2386                                                 EPOLLOUT | EPOLLWRNORM |
2387                                                 EPOLLWRBAND);
2388
             /* Report the address of the sock recorded in skb->sk. */
2389         if (msg->msg_name)
2390                 unix_copy_addr(msg, skb->sk);
2391
             /* Clamp to what the datagram holds past the peek offset; flag
              * truncation when the caller's buffer is smaller than that.
              */
2392         if (size > skb->len - skip)
2393                 size = skb->len - skip;
2394         else if (size < skb->len - skip)
2395                 msg->msg_flags |= MSG_TRUNC;
2396
2397         err = skb_copy_datagram_msg(skb, skip, msg, size);
2398         if (err)
2399                 goto out_free;
2400
2401         if (sock_flag(sk, SOCK_RCVTSTAMP))
2402                 __sock_recv_timestamp(msg, sk, skb);
2403
2404         memset(&scm, 0, sizeof(scm));
2405
2406         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2407         unix_set_secdata(&scm, skb);
2408
2409         if (!(flags & MSG_PEEK)) {
                     /* Real read: fds are detached from the skb into scm. */
2410                 if (UNIXCB(skb).fp)
2411                         unix_detach_fds(&scm, skb);
2412
2413                 sk_peek_offset_bwd(sk, skb->len);
2414         } else {
2415                 /* It is questionable: on PEEK we could:
2416                    - do not return fds - good, but too simple 8)
2417                    - return fds, and do not return them on read (old strategy,
2418                      apparently wrong)
2419                    - clone fds (I chose it for now, it is the most universal
2420                      solution)
2421
2422                    POSIX 1003.1g does not actually define this clearly
2423                    at all. POSIX 1003.1g doesn't define a lot of things
2424                    clearly however!
2425
2426                 */
2427
2428                 sk_peek_offset_fwd(sk, size);
2429
2430                 if (UNIXCB(skb).fp)
2431                         unix_peek_fds(&scm, skb);
2432         }
             /* MSG_TRUNC in the caller's flags asks for the real length. */
2433         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2434
2435         scm_recv_unix(sock, msg, &scm, flags);
2436
2437 out_free:
2438         skb_free_datagram(sk, skb);
2439         mutex_unlock(&u->iolock);
2440 out:
2441         return err;
2442 }
2443
/*
 * recvmsg entry point for datagram sockets.
 *
 * With CONFIG_BPF_SYSCALL, a sock whose sk_prot is no longer
 * unix_dgram_proto is served by that proto's recvmsg callback; in every
 * other case the work is done by __unix_dgram_recvmsg().
 */
2444 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2445                               int flags)
2446 {
2447         struct sock *sk = sock->sk;
2448
2449 #ifdef CONFIG_BPF_SYSCALL
2450         const struct proto *prot = READ_ONCE(sk->sk_prot);
2451
2452         if (prot != &unix_dgram_proto)
2453                 return prot->recvmsg(sk, msg, size, flags, NULL);
2454 #endif
2455         return __unix_dgram_recvmsg(sk, msg, size, flags);
2456 }
2457
/*
 * Dequeue one skb from @sk without blocking and hand it to @recv_actor.
 *
 * The dequeue is done under u->iolock; the actor is called after the lock
 * has been released.  Returns the error set by skb_recv_datagram() when no
 * skb was available, otherwise whatever @recv_actor returns.
 */
2458 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2459 {
2460         struct unix_sock *u = unix_sk(sk);
2461         struct sk_buff *skb;
2462         int err;
2463
2464         mutex_lock(&u->iolock);
2465         skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2466         mutex_unlock(&u->iolock);
2467         if (!skb)
2468                 return err;
2469
2470         return recv_actor(sk, skb);
2471 }
2472
2473 /*
2474  *      Sleep until more data has arrived. But check for races..
      *
      *      @last and @last_len are the tail skb of the receive queue and
      *      its length as seen by the caller.  The wait ends as soon as the
      *      tail or its length differs from them, the socket has an error,
      *      the receive side is shut down, a signal is pending, the timeout
      *      has expired, or the socket is found dead after waking up.
      *      @freezable adds TASK_FREEZABLE to the sleep state.
      *
      *      Returns the remaining timeout.
2475  */
2476 static long unix_stream_data_wait(struct sock *sk, long timeo,
2477                                   struct sk_buff *last, unsigned int last_len,
2478                                   bool freezable)
2479 {
2480         unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2481         struct sk_buff *tail;
2482         DEFINE_WAIT(wait);
2483
2484         unix_state_lock(sk);
2485
2486         for (;;) {
2487                 prepare_to_wait(sk_sleep(sk), &wait, state);
2488
                     /* All wake-up conditions are evaluated under the state
                      * lock, after the task has been put on the wait queue.
                      */
2489                 tail = skb_peek_tail(&sk->sk_receive_queue);
2490                 if (tail != last ||
2491                     (tail && tail->len != last_len) ||
2492                     sk->sk_err ||
2493                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
2494                     signal_pending(current) ||
2495                     !timeo)
2496                         break;
2497
2498                 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
                     /* The state lock is not held while sleeping. */
2499                 unix_state_unlock(sk);
2500                 timeo = schedule_timeout(timeo);
2501                 unix_state_lock(sk);
2502
                     /* On a dead socket the loop exits without clearing
                      * SOCKWQ_ASYNC_WAITDATA.
                      */
2503                 if (sock_flag(sk, SOCK_DEAD))
2504                         break;
2505
2506                 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2507         }
2508
2509         finish_wait(sk_sleep(sk), &wait);
2510         unix_state_unlock(sk);
2511         return timeo;
2512 }
2513
/*
 * Return the number of bytes in @skb that have not been consumed yet:
 * skb->len minus the consumed counter kept in the skb control block.
 */
2514 static unsigned int unix_skb_len(const struct sk_buff *skb)
2515 {
2516         return skb->len - UNIXCB(skb).consumed;
2517 }
2518
/*
 * Arguments of one stream read operation, bundled so that they can be
 * passed around as a single pointer.
 */
2519 struct unix_stream_read_state {
             /* Copies data out of an skb.  unix_stream_recv_urg() calls it
              * as recv_actor(skb, 0, 1, state) and treats a negative return
              * as failure; presumably the two ints are an offset into the
              * skb and a byte count - confirm against the actors.
              */
2520         int (*recv_actor)(struct sk_buff *, int, int,
2521                           struct unix_stream_read_state *);
             /* Socket being read from. */
2522         struct socket *socket;
             /* Message header of the read; MSG_OOB is reported in its msg_flags. */
2523         struct msghdr *msg;
             /* Not used in the code visible here; presumably the target of a splice read. */
2524         struct pipe_inode_info *pipe;
             /* Number of bytes requested. */
2525         size_t size;
             /* MSG_* flags of the read (MSG_PEEK, MSG_OOB, MSG_DONTWAIT, ...). */
2526         int flags;
             /* Not used in the code visible here; presumably the flags of a splice read. */
2527         unsigned int splice_flags;
2528 };
2529
2530 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * Read the pending out-of-band byte (receive with MSG_OOB).
 *
 * Returns 1 on success, with MSG_OOB set in the message flags; -EINVAL if
 * SOCK_URGINLINE is set on the socket or no OOB skb is pending; -EFAULT if
 * the recv_actor reports a failure.
 *
 * Without MSG_PEEK the oob_skb pointer is cleared and the byte is marked
 * consumed.  With MSG_PEEK the pointer is left in place.
 */
2531 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2532 {
2533         struct socket *sock = state->socket;
2534         struct sock *sk = sock->sk;
2535         struct unix_sock *u = unix_sk(sk);
2536         int chunk = 1;
2537         struct sk_buff *oob_skb;
2538
2539         mutex_lock(&u->iolock);
2540         unix_state_lock(sk);
2541
2542         if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2543                 unix_state_unlock(sk);
2544                 mutex_unlock(&u->iolock);
2545                 return -EINVAL;
2546         }
2547
2548         oob_skb = u->oob_skb;
2549
             /* A real read takes over the reference held by the oob_skb
              * pointer; a peek takes an extra one.  Either way the
              * consume_skb() below releases exactly one reference.
              */
2550         if (!(state->flags & MSG_PEEK))
2551                 WRITE_ONCE(u->oob_skb, NULL);
2552         else
2553                 skb_get(oob_skb);
2554         unix_state_unlock(sk);
2555
2556         chunk = state->recv_actor(oob_skb, 0, chunk, state);
2557
             /* The byte is marked consumed even if the actor failed. */
2558         if (!(state->flags & MSG_PEEK))
2559                 UNIXCB(oob_skb).consumed += 1;
2560
2561         consume_skb(oob_skb);
2562
2563         mutex_unlock(&u->iolock);
2564
2565         if (chunk < 0)
2566                 return -EFAULT;
2567
2568         state->msg->msg_flags |= MSG_OOB;
2569         return 1;
2570 }
2571
/*
 * Decide what a stream read should do with @skb with respect to
 * out-of-band data.
 *
 * Returns the skb the caller should use (@skb itself or the new head of
 * the receive queue), or NULL.
 *
 *  - @skb fully consumed and not peeking: it is unlinked and freed, NULL is
 *    returned.
 *  - @skb is the pending OOB skb and @copied is non-zero: NULL is returned
 *    and nothing is modified.
 *  - @skb is the pending OOB skb and SOCK_URGINLINE is set: @skb is
 *    returned; unless peeking, the oob_skb pointer is cleared and its
 *    reference released.
 *  - @skb is the pending OOB skb, not inline, not peeking: @skb is unlinked
 *    and freed, and the new head of the receive queue (possibly NULL) is
 *    returned.
 *  - anything else: @skb is returned unchanged.
 */
2572 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2573                                   int flags, int copied)
2574 {
2575         struct unix_sock *u = unix_sk(sk);
2576
2577         if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2578                 skb_unlink(skb, &sk->sk_receive_queue);
2579                 consume_skb(skb);
2580                 skb = NULL;
2581         } else {
2582                 if (skb == u->oob_skb) {
2583                         if (copied) {
2584                                 skb = NULL;
2585                         } else if (sock_flag(sk, SOCK_URGINLINE)) {
2586                                 if (!(flags & MSG_PEEK)) {
2587                                         WRITE_ONCE(u->oob_skb, NULL);
2588                                         consume_skb(skb);
2589                                 }
2590                         } else if (!(flags & MSG_PEEK)) {
2591                                 skb_unlink(skb, &sk->sk_receive_queue);
2592                                 consume_skb(skb);
2593                                 skb = skb_peek(&sk->sk_receive_queue);
2594                         }
2595                 }
2596         }
2597         return skb;
2598 }
2599 #endif
2600
2601 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2602 {
2603         if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2604                 return -ENOTCONN;
2605
2606         return unix_read_skb(sk, recv_actor);
2607 }
2608
2609 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2610                                     bool freezable)
2611 {
2612         struct scm_cookie scm;
2613         struct socket *sock = state->socket;
2614         struct sock *sk = sock->sk;
2615         struct unix_sock *u = unix_sk(sk);
2616         int copied = 0;
2617         int flags = state->flags;
2618         int noblock = flags & MSG_DONTWAIT;
2619         bool check_creds = false;
2620         int target;
2621         int err = 0;
2622         long timeo;
2623         int skip;
2624         size_t size = state->size;
2625         unsigned int last_len;
2626
2627         if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2628                 err = -EINVAL;
2629                 goto out;
2630         }
2631
2632         if (unlikely(flags & MSG_OOB)) {
2633                 err = -EOPNOTSUPP;
2634 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2635                 err = unix_stream_recv_urg(state);
2636 #endif
2637                 goto out;
2638         }
2639
2640         target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2641         timeo = sock_rcvtimeo(sk, noblock);
2642
2643         memset(&scm, 0, sizeof(scm));
2644
2645         /* Lock the socket to prevent queue disordering
2646          * while sleeps in memcpy_tomsg
2647          */
2648         mutex_lock(&u->iolock);
2649
2650         skip = max(sk_peek_offset(sk, flags), 0);
2651
2652         do {
2653                 int chunk;
2654                 bool drop_skb;
2655                 struct sk_buff *skb, *last;
2656
2657 redo:
2658                 unix_state_lock(sk);
2659                 if (sock_flag(sk, SOCK_DEAD)) {
2660                         err = -ECONNRESET;
2661                         goto unlock;
2662                 }
2663                 last = skb = skb_peek(&sk->sk_receive_queue);
2664                 last_len = last ? last->len : 0;
2665
2666 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2667                 if (skb) {
2668                         skb = manage_oob(skb, sk, flags, copied);
2669                         if (!skb) {
2670                                 unix_state_unlock(sk);
2671                                 if (copied)
2672                                         break;
2673                                 goto redo;
2674                         }
2675                 }
2676 #endif
2677 again:
2678                 if (skb == NULL) {
2679                         if (copied >= target)
2680                                 goto unlock;
2681
2682                         /*
2683                          *      POSIX 1003.1g mandates this order.
2684                          */
2685
2686                         err = sock_error(sk);
2687                         if (err)
2688                                 goto unlock;
2689                         if (sk->sk_shutdown & RCV_SHUTDOWN)
2690                                 goto unlock;
2691
2692                         unix_state_unlock(sk);
2693                         if (!timeo) {
2694                                 err = -EAGAIN;
2695                                 break;
2696                         }
2697
2698                         mutex_unlock(&u->iolock);
2699
2700                         timeo = unix_stream_data_wait(sk, timeo, last,
2701                                                       last_len, freezable);
2702
2703                         if (signal_pending(current)) {
2704                                 err = sock_intr_errno(timeo);
2705                                 scm_destroy(&scm);
2706                                 goto out;
2707                         }
2708
2709                         mutex_lock(&u->iolock);
2710                         goto redo;
2711 unlock:
2712                         unix_state_unlock(sk);
2713                         break;
2714                 }
2715
2716                 while (skip >= unix_skb_len(skb)) {
2717                         skip -= unix_skb_len(skb);
2718                         last = skb;
2719                         last_len = skb->len;
2720                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2721                         if (!skb)
2722                                 goto again;
2723                 }
2724
2725                 unix_state_unlock(sk);
2726
2727                 if (check_creds) {
2728                         /* Never glue messages from different writers */
2729                         if (!unix_skb_scm_eq(skb, &scm))
2730                                 break;
2731                 } else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2732                            test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2733                         /* Copy credentials */
2734                         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2735                         unix_set_secdata(&scm, skb);
2736                         check_creds = true;
2737                 }
2738
2739                 /* Copy address just once */
2740                 if (state->msg && state->msg->msg_name) {
2741                         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2742                                          state->msg->msg_name);
2743                         unix_copy_addr(state->msg, skb->sk);
2744                         sunaddr = NULL;
2745                 }
2746
2747                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2748                 skb_get(skb);
2749                 chunk = state->recv_actor(skb, skip, chunk, state);
2750                 drop_skb = !unix_skb_len(skb);
2751                 /* skb is only safe to use if !drop_skb */
2752                 consume_skb(skb);
2753                 if (chunk < 0) {
2754                         if (copied == 0)
2755                                 copied = -EFAULT;
2756                         break;
2757                 }
2758                 copied += chunk;
2759                 size -= chunk;
2760
2761                 if (drop_skb) {
2762                         /* the skb was touched by a concurrent reader;
2763                          * we should not expect anything from this skb
2764                          * anymore and assume it invalid - we can be
2765                          * sure it was dropped from the socket queue
2766                          *
2767                          * let's report a short read
2768                          */
2769                         err = 0;
2770                         break;
2771                 }
2772
2773                 /* Mark read part of skb as used */
2774                 if (!(flags & MSG_PEEK)) {
2775                         UNIXCB(skb).consumed += chunk;
2776
2777                         sk_peek_offset_bwd(sk, chunk);
2778
2779                         if (UNIXCB(skb).fp) {
2780                                 scm_stat_del(sk, skb);
2781                                 unix_detach_fds(&scm, skb);
2782                         }
2783
2784                         if (unix_skb_len(skb))
2785                                 break;
2786
2787                         skb_unlink(skb, &sk->sk_receive_queue);
2788                         consume_skb(skb);
2789
2790                         if (scm.fp)
2791                                 break;
2792                 } else {
2793                         /* It is questionable, see note in unix_dgram_recvmsg.
2794                          */
2795                         if (UNIXCB(skb).fp)
2796                                 unix_peek_fds(&scm, skb);
2797
2798                         sk_peek_offset_fwd(sk, chunk);
2799
2800                         if (UNIXCB(skb).fp)
2801                                 break;
2802
2803                         skip = 0;
2804                         last = skb;
2805                         last_len = skb->len;
2806                         unix_state_lock(sk);
2807                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2808                         if (skb)
2809                                 goto again;
2810                         unix_state_unlock(sk);
2811                         break;
2812                 }
2813         } while (size);
2814
2815         mutex_unlock(&u->iolock);
2816         if (state->msg)
2817                 scm_recv_unix(sock, state->msg, &scm, flags);
2818         else
2819                 scm_destroy(&scm);
2820 out:
2821         return copied ? : err;
2822 }
2823
2824 static int unix_stream_read_actor(struct sk_buff *skb,
2825                                   int skip, int chunk,
2826                                   struct unix_stream_read_state *state)
2827 {
2828         int ret;
2829
2830         ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2831                                     state->msg, chunk);
2832         return ret ?: chunk;
2833 }
2834
2835 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2836                           size_t size, int flags)
2837 {
2838         struct unix_stream_read_state state = {
2839                 .recv_actor = unix_stream_read_actor,
2840                 .socket = sk->sk_socket,
2841                 .msg = msg,
2842                 .size = size,
2843                 .flags = flags
2844         };
2845
2846         return unix_stream_read_generic(&state, true);
2847 }
2848
2849 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2850                                size_t size, int flags)
2851 {
2852         struct unix_stream_read_state state = {
2853                 .recv_actor = unix_stream_read_actor,
2854                 .socket = sock,
2855                 .msg = msg,
2856                 .size = size,
2857                 .flags = flags
2858         };
2859
2860 #ifdef CONFIG_BPF_SYSCALL
2861         struct sock *sk = sock->sk;
2862         const struct proto *prot = READ_ONCE(sk->sk_prot);
2863
2864         if (prot != &unix_stream_proto)
2865                 return prot->recvmsg(sk, msg, size, flags, NULL);
2866 #endif
2867         return unix_stream_read_generic(&state, true);
2868 }
2869
2870 static int unix_stream_splice_actor(struct sk_buff *skb,
2871                                     int skip, int chunk,
2872                                     struct unix_stream_read_state *state)
2873 {
2874         return skb_splice_bits(skb, state->socket->sk,
2875                                UNIXCB(skb).consumed + skip,
2876                                state->pipe, chunk, state->splice_flags);
2877 }
2878
2879 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2880                                        struct pipe_inode_info *pipe,
2881                                        size_t size, unsigned int flags)
2882 {
2883         struct unix_stream_read_state state = {
2884                 .recv_actor = unix_stream_splice_actor,
2885                 .socket = sock,
2886                 .pipe = pipe,
2887                 .size = size,
2888                 .splice_flags = flags,
2889         };
2890
2891         if (unlikely(*ppos))
2892                 return -ESPIPE;
2893
2894         if (sock->file->f_flags & O_NONBLOCK ||
2895             flags & SPLICE_F_NONBLOCK)
2896                 state.flags = MSG_DONTWAIT;
2897
2898         return unix_stream_read_generic(&state, false);
2899 }
2900
2901 static int unix_shutdown(struct socket *sock, int mode)
2902 {
2903         struct sock *sk = sock->sk;
2904         struct sock *other;
2905
2906         if (mode < SHUT_RD || mode > SHUT_RDWR)
2907                 return -EINVAL;
2908         /* This maps:
2909          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2910          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2911          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2912          */
2913         ++mode;
2914
2915         unix_state_lock(sk);
2916         WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2917         other = unix_peer(sk);
2918         if (other)
2919                 sock_hold(other);
2920         unix_state_unlock(sk);
2921         sk->sk_state_change(sk);
2922
2923         if (other &&
2924                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2925
2926                 int peer_mode = 0;
2927                 const struct proto *prot = READ_ONCE(other->sk_prot);
2928
2929                 if (prot->unhash)
2930                         prot->unhash(other);
2931                 if (mode&RCV_SHUTDOWN)
2932                         peer_mode |= SEND_SHUTDOWN;
2933                 if (mode&SEND_SHUTDOWN)
2934                         peer_mode |= RCV_SHUTDOWN;
2935                 unix_state_lock(other);
2936                 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2937                 unix_state_unlock(other);
2938                 other->sk_state_change(other);
2939                 if (peer_mode == SHUTDOWN_MASK)
2940                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2941                 else if (peer_mode & RCV_SHUTDOWN)
2942                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2943         }
2944         if (other)
2945                 sock_put(other);
2946
2947         return 0;
2948 }
2949
2950 long unix_inq_len(struct sock *sk)
2951 {
2952         struct sk_buff *skb;
2953         long amount = 0;
2954
2955         if (sk->sk_state == TCP_LISTEN)
2956                 return -EINVAL;
2957
2958         spin_lock(&sk->sk_receive_queue.lock);
2959         if (sk->sk_type == SOCK_STREAM ||
2960             sk->sk_type == SOCK_SEQPACKET) {
2961                 skb_queue_walk(&sk->sk_receive_queue, skb)
2962                         amount += unix_skb_len(skb);
2963         } else {
2964                 skb = skb_peek(&sk->sk_receive_queue);
2965                 if (skb)
2966                         amount = skb->len;
2967         }
2968         spin_unlock(&sk->sk_receive_queue.lock);
2969
2970         return amount;
2971 }
2972 EXPORT_SYMBOL_GPL(unix_inq_len);
2973
/* Report the socket's write-memory allocation, as given by sk_wmem_alloc_get(). */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
2979
2980 static int unix_open_file(struct sock *sk)
2981 {
2982         struct path path;
2983         struct file *f;
2984         int fd;
2985
2986         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2987                 return -EPERM;
2988
2989         if (!smp_load_acquire(&unix_sk(sk)->addr))
2990                 return -ENOENT;
2991
2992         path = unix_sk(sk)->path;
2993         if (!path.dentry)
2994                 return -ENOENT;
2995
2996         path_get(&path);
2997
2998         fd = get_unused_fd_flags(O_CLOEXEC);
2999         if (fd < 0)
3000                 goto out;
3001
3002         f = dentry_open(&path, O_PATH, current_cred());
3003         if (IS_ERR(f)) {
3004                 put_unused_fd(fd);
3005                 fd = PTR_ERR(f);
3006                 goto out;
3007         }
3008
3009         fd_install(fd, f);
3010 out:
3011         path_put(&path);
3012
3013         return fd;
3014 }
3015
3016 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3017 {
3018         struct sock *sk = sock->sk;
3019         long amount = 0;
3020         int err;
3021
3022         switch (cmd) {
3023         case SIOCOUTQ:
3024                 amount = unix_outq_len(sk);
3025                 err = put_user(amount, (int __user *)arg);
3026                 break;
3027         case SIOCINQ:
3028                 amount = unix_inq_len(sk);
3029                 if (amount < 0)
3030                         err = amount;
3031                 else
3032                         err = put_user(amount, (int __user *)arg);
3033                 break;
3034         case SIOCUNIXFILE:
3035                 err = unix_open_file(sk);
3036                 break;
3037 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3038         case SIOCATMARK:
3039                 {
3040                         struct sk_buff *skb;
3041                         int answ = 0;
3042
3043                         skb = skb_peek(&sk->sk_receive_queue);
3044                         if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3045                                 answ = 1;
3046                         err = put_user(answ, (int __user *)arg);
3047                 }
3048                 break;
3049 #endif
3050         default:
3051                 err = -ENOIOCTLCMD;
3052                 break;
3053         }
3054         return err;
3055 }
3056
3057 #ifdef CONFIG_COMPAT
/* Compat ioctl: translate the user pointer with compat_ptr(), then reuse unix_ioctl(). */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return unix_ioctl(sock, cmd, native_arg);
}
3062 #endif
3063
3064 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3065 {
3066         struct sock *sk = sock->sk;
3067         __poll_t mask;
3068         u8 shutdown;
3069
3070         sock_poll_wait(file, sock, wait);
3071         mask = 0;
3072         shutdown = READ_ONCE(sk->sk_shutdown);
3073
3074         /* exceptional events? */
3075         if (READ_ONCE(sk->sk_err))
3076                 mask |= EPOLLERR;
3077         if (shutdown == SHUTDOWN_MASK)
3078                 mask |= EPOLLHUP;
3079         if (shutdown & RCV_SHUTDOWN)
3080                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3081
3082         /* readable? */
3083         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3084                 mask |= EPOLLIN | EPOLLRDNORM;
3085         if (sk_is_readable(sk))
3086                 mask |= EPOLLIN | EPOLLRDNORM;
3087 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3088         if (READ_ONCE(unix_sk(sk)->oob_skb))
3089                 mask |= EPOLLPRI;
3090 #endif
3091
3092         /* Connection-based need to check for termination and startup */
3093         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3094             sk->sk_state == TCP_CLOSE)
3095                 mask |= EPOLLHUP;
3096
3097         /*
3098          * we set writable also when the other side has shut down the
3099          * connection. This prevents stuck sockets.
3100          */
3101         if (unix_writable(sk))
3102                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3103
3104         return mask;
3105 }
3106
3107 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3108                                     poll_table *wait)
3109 {
3110         struct sock *sk = sock->sk, *other;
3111         unsigned int writable;
3112         __poll_t mask;
3113         u8 shutdown;
3114
3115         sock_poll_wait(file, sock, wait);
3116         mask = 0;
3117         shutdown = READ_ONCE(sk->sk_shutdown);
3118
3119         /* exceptional events? */
3120         if (READ_ONCE(sk->sk_err) ||
3121             !skb_queue_empty_lockless(&sk->sk_error_queue))
3122                 mask |= EPOLLERR |
3123                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3124
3125         if (shutdown & RCV_SHUTDOWN)
3126                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3127         if (shutdown == SHUTDOWN_MASK)
3128                 mask |= EPOLLHUP;
3129
3130         /* readable? */
3131         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3132                 mask |= EPOLLIN | EPOLLRDNORM;
3133         if (sk_is_readable(sk))
3134                 mask |= EPOLLIN | EPOLLRDNORM;
3135
3136         /* Connection-based need to check for termination and startup */
3137         if (sk->sk_type == SOCK_SEQPACKET) {
3138                 if (sk->sk_state == TCP_CLOSE)
3139                         mask |= EPOLLHUP;
3140                 /* connection hasn't started yet? */
3141                 if (sk->sk_state == TCP_SYN_SENT)
3142                         return mask;
3143         }
3144
3145         /* No write status requested, avoid expensive OUT tests. */
3146         if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3147                 return mask;
3148
3149         writable = unix_writable(sk);
3150         if (writable) {
3151                 unix_state_lock(sk);
3152
3153                 other = unix_peer(sk);
3154                 if (other && unix_peer(other) != sk &&
3155                     unix_recvq_full_lockless(other) &&
3156                     unix_dgram_peer_wake_me(sk, other))
3157                         writable = 0;
3158
3159                 unix_state_unlock(sk);
3160         }
3161
3162         if (writable)
3163                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3164         else
3165                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3166
3167         return mask;
3168 }
3169
3170 #ifdef CONFIG_PROC_FS
3171
/*
 * The seq_file position packs two values into one word: the hash bucket
 * index in the bits above BUCKET_SPACE and a per-bucket offset in the low
 * BUCKET_SPACE bits.  The iterators below store 1-based offsets (they call
 * set_bucket_offset(bucket, 1) to point at the first socket of a bucket).
 */
3172 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3173
/* get_bucket(): bucket index stored in position x. */
3174 #define get_bucket(x) ((x) >> BUCKET_SPACE)
/* get_offset(): offset within the bucket stored in position x. */
3175 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
/* set_bucket_offset(): build a position from bucket b and offset o. */
3176 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3177
3178 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3179 {
3180         unsigned long offset = get_offset(*pos);
3181         unsigned long bucket = get_bucket(*pos);
3182         unsigned long count = 0;
3183         struct sock *sk;
3184
3185         for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3186              sk; sk = sk_next(sk)) {
3187                 if (++count == offset)
3188                         break;
3189         }
3190
3191         return sk;
3192 }
3193
3194 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3195 {
3196         unsigned long bucket = get_bucket(*pos);
3197         struct net *net = seq_file_net(seq);
3198         struct sock *sk;
3199
3200         while (bucket < UNIX_HASH_SIZE) {
3201                 spin_lock(&net->unx.table.locks[bucket]);
3202
3203                 sk = unix_from_bucket(seq, pos);
3204                 if (sk)
3205                         return sk;
3206
3207                 spin_unlock(&net->unx.table.locks[bucket]);
3208
3209                 *pos = set_bucket_offset(++bucket, 1);
3210         }
3211
3212         return NULL;
3213 }
3214
3215 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3216                                   loff_t *pos)
3217 {
3218         unsigned long bucket = get_bucket(*pos);
3219
3220         sk = sk_next(sk);
3221         if (sk)
3222                 return sk;
3223
3224
3225         spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3226
3227         *pos = set_bucket_offset(++bucket, 1);
3228
3229         return unix_get_first(seq, pos);
3230 }
3231
3232 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3233 {
3234         if (!*pos)
3235                 return SEQ_START_TOKEN;
3236
3237         return unix_get_first(seq, pos);
3238 }
3239
3240 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3241 {
3242         ++*pos;
3243
3244         if (v == SEQ_START_TOKEN)
3245                 return unix_get_first(seq, pos);
3246
3247         return unix_get_next(seq, v, pos);
3248 }
3249
3250 static void unix_seq_stop(struct seq_file *seq, void *v)
3251 {
3252         struct sock *sk = v;
3253
3254         if (sk)
3255                 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3256 }
3257
3258 static int unix_seq_show(struct seq_file *seq, void *v)
3259 {
3260
3261         if (v == SEQ_START_TOKEN)
3262                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3263                          "Inode Path\n");
3264         else {
3265                 struct sock *s = v;
3266                 struct unix_sock *u = unix_sk(s);
3267                 unix_state_lock(s);
3268
3269                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3270                         s,
3271                         refcount_read(&s->sk_refcnt),
3272                         0,
3273                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3274                         s->sk_type,
3275                         s->sk_socket ?
3276                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3277                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3278                         sock_i_ino(s));
3279
3280                 if (u->addr) {  // under a hash table lock here
3281                         int i, len;
3282                         seq_putc(seq, ' ');
3283
3284                         i = 0;
3285                         len = u->addr->len -
3286                                 offsetof(struct sockaddr_un, sun_path);
3287                         if (u->addr->name->sun_path[0]) {
3288                                 len--;
3289                         } else {
3290                                 seq_putc(seq, '@');
3291                                 i++;
3292                         }
3293                         for ( ; i < len; i++)
3294                                 seq_putc(seq, u->addr->name->sun_path[i] ?:
3295                                          '@');
3296                 }
3297                 unix_state_unlock(s);
3298                 seq_putc(seq, '\n');
3299         }
3300
3301         return 0;
3302 }
3303
/*
 * seq_file callbacks for the "unix" proc entry that unix_net_init()
 * registers with proc_create_net().  ->start/->next return with a bucket
 * lock held whenever they return a socket; ->stop releases it.
 */
3304 static const struct seq_operations unix_seq_ops = {
3305         .start  = unix_seq_start,
3306         .next   = unix_seq_next,
3307         .stop   = unix_seq_stop,
3308         .show   = unix_seq_show,
3309 };
3310
3311 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
/*
 * Private seq_file state of the BPF unix-socket iterator.  Sockets of one
 * hash bucket are collected, with a reference held on each, into @batch.
 */
3312 struct bpf_unix_iter_state {
3313         struct seq_net_private p;
3314         unsigned int cur_sk;	/* index in batch of the socket currently being shown */
3315         unsigned int end_sk;	/* number of sockets stored in batch */
3316         unsigned int max_sk;	/* capacity of batch, in entries */
3317         struct sock **batch;	/* held sockets; replaced by bpf_iter_unix_realloc_batch() */
3318         bool st_bucket_done;	/* set once a whole bucket fit into batch; the next batch then starts at the following bucket */
3319 };
3320
/*
 * Context handed to the BPF iterator program by unix_prog_seq_show():
 * the iterator metadata, the socket being visited and a uid value.
 */
3321 struct bpf_iter__unix {
3322         __bpf_md_ptr(struct bpf_iter_meta *, meta);
3323         __bpf_md_ptr(struct unix_sock *, unix_sk);
3324         uid_t uid __aligned(8);
3325 };
3326
3327 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3328                               struct unix_sock *unix_sk, uid_t uid)
3329 {
3330         struct bpf_iter__unix ctx;
3331
3332         meta->seq_num--;  /* skip SEQ_START_TOKEN */
3333         ctx.meta = meta;
3334         ctx.unix_sk = unix_sk;
3335         ctx.uid = uid;
3336         return bpf_iter_run_prog(prog, &ctx);
3337 }
3338
3339 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3340
3341 {
3342         struct bpf_unix_iter_state *iter = seq->private;
3343         unsigned int expected = 1;
3344         struct sock *sk;
3345
3346         sock_hold(start_sk);
3347         iter->batch[iter->end_sk++] = start_sk;
3348
3349         for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3350                 if (iter->end_sk < iter->max_sk) {
3351                         sock_hold(sk);
3352                         iter->batch[iter->end_sk++] = sk;
3353                 }
3354
3355                 expected++;
3356         }
3357
3358         spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3359
3360         return expected;
3361 }
3362
3363 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3364 {
3365         while (iter->cur_sk < iter->end_sk)
3366                 sock_put(iter->batch[iter->cur_sk++]);
3367 }
3368
3369 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3370                                        unsigned int new_batch_sz)
3371 {
3372         struct sock **new_batch;
3373
3374         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3375                              GFP_USER | __GFP_NOWARN);
3376         if (!new_batch)
3377                 return -ENOMEM;
3378
3379         bpf_iter_unix_put_batch(iter);
3380         kvfree(iter->batch);
3381         iter->batch = new_batch;
3382         iter->max_sk = new_batch_sz;
3383
3384         return 0;
3385 }
3386
3387 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3388                                         loff_t *pos)
3389 {
3390         struct bpf_unix_iter_state *iter = seq->private;
3391         unsigned int expected;
3392         bool resized = false;
3393         struct sock *sk;
3394
3395         if (iter->st_bucket_done)
3396                 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3397
3398 again:
3399         /* Get a new batch */
3400         iter->cur_sk = 0;
3401         iter->end_sk = 0;
3402
3403         sk = unix_get_first(seq, pos);
3404         if (!sk)
3405                 return NULL; /* Done */
3406
3407         expected = bpf_iter_unix_hold_batch(seq, sk);
3408
3409         if (iter->end_sk == expected) {
3410                 iter->st_bucket_done = true;
3411                 return sk;
3412         }
3413
3414         if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3415                 resized = true;
3416                 goto again;
3417         }
3418
3419         return sk;
3420 }
3421
3422 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3423 {
3424         if (!*pos)
3425                 return SEQ_START_TOKEN;
3426
3427         /* bpf iter does not support lseek, so it always
3428          * continue from where it was stop()-ped.
3429          */
3430         return bpf_iter_unix_batch(seq, pos);
3431 }
3432
3433 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3434 {
3435         struct bpf_unix_iter_state *iter = seq->private;
3436         struct sock *sk;
3437
3438         /* Whenever seq_next() is called, the iter->cur_sk is
3439          * done with seq_show(), so advance to the next sk in
3440          * the batch.
3441          */
3442         if (iter->cur_sk < iter->end_sk)
3443                 sock_put(iter->batch[iter->cur_sk++]);
3444
3445         ++*pos;
3446
3447         if (iter->cur_sk < iter->end_sk)
3448                 sk = iter->batch[iter->cur_sk];
3449         else
3450                 sk = bpf_iter_unix_batch(seq, pos);
3451
3452         return sk;
3453 }
3454
3455 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3456 {
3457         struct bpf_iter_meta meta;
3458         struct bpf_prog *prog;
3459         struct sock *sk = v;
3460         uid_t uid;
3461         bool slow;
3462         int ret;
3463
3464         if (v == SEQ_START_TOKEN)
3465                 return 0;
3466
3467         slow = lock_sock_fast(sk);
3468
3469         if (unlikely(sk_unhashed(sk))) {
3470                 ret = SEQ_SKIP;
3471                 goto unlock;
3472         }
3473
3474         uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3475         meta.seq = seq;
3476         prog = bpf_iter_get_info(&meta, false);
3477         ret = unix_prog_seq_show(prog, &meta, v, uid);
3478 unlock:
3479         unlock_sock_fast(sk, slow);
3480         return ret;
3481 }
3482
3483 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3484 {
3485         struct bpf_unix_iter_state *iter = seq->private;
3486         struct bpf_iter_meta meta;
3487         struct bpf_prog *prog;
3488
3489         if (!v) {
3490                 meta.seq = seq;
3491                 prog = bpf_iter_get_info(&meta, true);
3492                 if (prog)
3493                         (void)unix_prog_seq_show(prog, &meta, v, 0);
3494         }
3495
3496         if (iter->cur_sk < iter->end_sk)
3497                 bpf_iter_unix_put_batch(iter);
3498 }
3499
3500 static const struct seq_operations bpf_iter_unix_seq_ops = {
3501         .start  = bpf_iter_unix_seq_start,
3502         .next   = bpf_iter_unix_seq_next,
3503         .stop   = bpf_iter_unix_seq_stop,
3504         .show   = bpf_iter_unix_seq_show,
3505 };
3506 #endif
3507 #endif
3508
3509 static const struct net_proto_family unix_family_ops = {
3510         .family = PF_UNIX,
3511         .create = unix_create,
3512         .owner  = THIS_MODULE,
3513 };
3514
3515
3516 static int __net_init unix_net_init(struct net *net)
3517 {
3518         int i;
3519
3520         net->unx.sysctl_max_dgram_qlen = 10;
3521         if (unix_sysctl_register(net))
3522                 goto out;
3523
3524 #ifdef CONFIG_PROC_FS
3525         if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3526                              sizeof(struct seq_net_private)))
3527                 goto err_sysctl;
3528 #endif
3529
3530         net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3531                                               sizeof(spinlock_t), GFP_KERNEL);
3532         if (!net->unx.table.locks)
3533                 goto err_proc;
3534
3535         net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3536                                                 sizeof(struct hlist_head),
3537                                                 GFP_KERNEL);
3538         if (!net->unx.table.buckets)
3539                 goto free_locks;
3540
3541         for (i = 0; i < UNIX_HASH_SIZE; i++) {
3542                 spin_lock_init(&net->unx.table.locks[i]);
3543                 INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3544         }
3545
3546         return 0;
3547
3548 free_locks:
3549         kvfree(net->unx.table.locks);
3550 err_proc:
3551 #ifdef CONFIG_PROC_FS
3552         remove_proc_entry("unix", net->proc_net);
3553 err_sysctl:
3554 #endif
3555         unix_sysctl_unregister(net);
3556 out:
3557         return -ENOMEM;
3558 }
3559
3560 static void __net_exit unix_net_exit(struct net *net)
3561 {
3562         kvfree(net->unx.table.buckets);
3563         kvfree(net->unx.table.locks);
3564         unix_sysctl_unregister(net);
3565         remove_proc_entry("unix", net->proc_net);
3566 }
3567
3568 static struct pernet_operations unix_net_ops = {
3569         .init = unix_net_init,
3570         .exit = unix_net_exit,
3571 };
3572
3573 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3574 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3575                      struct unix_sock *unix_sk, uid_t uid)
3576
/* Number of entries the socket batch is sized for at init time. */
#define INIT_BATCH_SZ 16

/* init_seq_private callback of the bpf unix iterator.
 *
 * Performs the generic seq_net initialisation and then sizes the batch
 * array for INIT_BATCH_SZ entries.  If the batch allocation fails, the
 * seq_net initialisation is undone before returning.
 *
 * Returns 0 on success or the error reported by the failing step.
 */
static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_unix_iter_state *state = priv_data;
	int err;

	err = bpf_iter_init_seq_net(priv_data, aux);
	if (err)
		return err;

	err = bpf_iter_unix_realloc_batch(state, INIT_BATCH_SZ);
	if (err)
		bpf_iter_fini_seq_net(priv_data);

	return err;
}
3596
3597 static void bpf_iter_fini_unix(void *priv_data)
3598 {
3599         struct bpf_unix_iter_state *iter = priv_data;
3600
3601         bpf_iter_fini_seq_net(priv_data);
3602         kvfree(iter->batch);
3603 }
3604
3605 static const struct bpf_iter_seq_info unix_seq_info = {
3606         .seq_ops                = &bpf_iter_unix_seq_ops,
3607         .init_seq_private       = bpf_iter_init_unix,
3608         .fini_seq_private       = bpf_iter_fini_unix,
3609         .seq_priv_size          = sizeof(struct bpf_unix_iter_state),
3610 };
3611
3612 static const struct bpf_func_proto *
3613 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3614                              const struct bpf_prog *prog)
3615 {
3616         switch (func_id) {
3617         case BPF_FUNC_setsockopt:
3618                 return &bpf_sk_setsockopt_proto;
3619         case BPF_FUNC_getsockopt:
3620                 return &bpf_sk_getsockopt_proto;
3621         default:
3622                 return NULL;
3623         }
3624 }
3625
3626 static struct bpf_iter_reg unix_reg_info = {
3627         .target                 = "unix",
3628         .ctx_arg_info_size      = 1,
3629         .ctx_arg_info           = {
3630                 { offsetof(struct bpf_iter__unix, unix_sk),
3631                   PTR_TO_BTF_ID_OR_NULL },
3632         },
3633         .get_func_proto         = bpf_iter_unix_get_func_proto,
3634         .seq_info               = &unix_seq_info,
3635 };
3636
3637 static void __init bpf_iter_register(void)
3638 {
3639         unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3640         if (bpf_iter_reg_target(&unix_reg_info))
3641                 pr_warn("Warning: could not register bpf iterator unix\n");
3642 }
3643 #endif
3644
3645 static int __init af_unix_init(void)
3646 {
3647         int i, rc = -1;
3648
3649         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3650
3651         for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3652                 spin_lock_init(&bsd_socket_locks[i]);
3653                 INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3654         }
3655
3656         rc = proto_register(&unix_dgram_proto, 1);
3657         if (rc != 0) {
3658                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3659                 goto out;
3660         }
3661
3662         rc = proto_register(&unix_stream_proto, 1);
3663         if (rc != 0) {
3664                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3665                 proto_unregister(&unix_dgram_proto);
3666                 goto out;
3667         }
3668
3669         sock_register(&unix_family_ops);
3670         register_pernet_subsys(&unix_net_ops);
3671         unix_bpf_build_proto();
3672
3673 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3674         bpf_iter_register();
3675 #endif
3676
3677 out:
3678         return rc;
3679 }
3680
3681 static void __exit af_unix_exit(void)
3682 {
3683         sock_unregister(PF_UNIX);
3684         proto_unregister(&unix_dgram_proto);
3685         proto_unregister(&unix_stream_proto);
3686         unregister_pernet_subsys(&unix_net_ops);
3687 }
3688
3689 /* Earlier than device_initcall() so that other drivers invoking
3690    request_module() don't end up in a loop when modprobe tries
3691    to use a UNIX socket. But later than subsys_initcall() because
3692    we depend on stuff initialised there */
3693 fs_initcall(af_unix_init);
3694 module_exit(af_unix_exit);
3695
3696 MODULE_LICENSE("GPL");
3697 MODULE_ALIAS_NETPROTO(PF_UNIX);