net/unix/af_unix.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * NET4:        Implementation of BSD Unix domain sockets.
   4  *
   5  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   6  *
   7  * Fixes:
   8  *              Linus Torvalds  :       Assorted bug cures.
   9  *              Niibe Yutaka    :       async I/O support.
  10  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  11  *              Alan Cox        :       Limit size of allocated blocks.
  12  *              Alan Cox        :       Fixed the stupid socketpair bug.
  13  *              Alan Cox        :       BSD compatibility fine tuning.
  14  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  15  *              Alan Cox        :       Sorted out a proper draft version of
  16  *                                      file descriptor passing hacked up from
  17  *                                      Mike Shaver's work.
  18  *              Marty Leisner   :       Fixes to fd passing
  19  *              Nick Nevin      :       recvmsg bugfix.
  20  *              Alan Cox        :       Started proper garbage collector
  21  *              Heiko EiBfeldt  :       Missing verify_area check
  22  *              Alan Cox        :       Started POSIXisms
  23  *              Andreas Schwab  :       Replace inode by dentry for proper
  24  *                                      reference counting
  25  *              Kirk Petersen   :       Made this a module
  26  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  27  *                                      Lots of bug fixes.
  28  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
  29  *                                      by above two patches.
  30  *           Andrea Arcangeli   :       If possible we block in connect(2)
  31  *                                      if the max backlog of the listen socket
  32  *                                      has been reached. This won't break
  33  *                                      old apps and it will avoid a huge number
  34  *                                      of socks hashed (this for unix_gc()
  35  *                                      performance reasons).
  36  *                                      Security fix that limits the max
  37  *                                      number of socks to 2*max_files and
  38  *                                      the number of skb queueable in the
  39  *                                      dgram receiver.
  40  *              Artur Skawina   :       Hash function optimizations
  41  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  42  *            Malcolm Beattie   :       Set peercred for socketpair
  43  *           Michal Ostrowski   :       Module initialization cleanup.
  44  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  45  *                                      the core infrastructure is doing that
  46  *                                      for all net proto families now (2.5.69+)
  47  *
  48  * Known differences from reference BSD that was tested:
  49  *
  50  *      [TO FIX]
  51  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  52  *              other the moment one end closes.
  53  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  54  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  55  *      [NOT TO FIX]
  56  *      accept() returns a path name even if the connecting socket has closed
  57  *              in the meantime (BSD loses the path and gives up).
  58  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  59  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  60  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  61  *      BSD af_unix apparently has connect forgetting to block properly.
  62  *              (need to check this with the POSIX spec in detail)
  63  *
  64  * Differences from 2.0.0-11-... (ANK)
  65  *      Bug fixes and improvements.
  66  *              - client shutdown killed server socket.
  67  *              - removed all useless cli/sti pairs.
  68  *
  69  *      Semantic changes/extensions.
  70  *              - generic control message passing.
  71  *              - SCM_CREDENTIALS control message.
  72  *              - "Abstract" (not FS based) socket bindings.
  73  *                Abstract names are sequences of bytes (not zero terminated)
  74  *                started by 0, so that this name space does not intersect
  75  *                with BSD names.
  76  */
  77
  78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  79
  80 #include <linux/module.h>
  81 #include <linux/kernel.h>
  82 #include <linux/signal.h>
  83 #include <linux/sched/signal.h>
  84 #include <linux/errno.h>
  85 #include <linux/string.h>
  86 #include <linux/stat.h>
  87 #include <linux/dcache.h>
  88 #include <linux/namei.h>
  89 #include <linux/socket.h>
  90 #include <linux/un.h>
  91 #include <linux/fcntl.h>
  92 #include <linux/filter.h>
  93 #include <linux/termios.h>
  94 #include <linux/sockios.h>
  95 #include <linux/net.h>
  96 #include <linux/in.h>
  97 #include <linux/fs.h>
  98 #include <linux/slab.h>
  99 #include <linux/uaccess.h>
 100 #include <linux/skbuff.h>
 101 #include <linux/netdevice.h>
 102 #include <net/net_namespace.h>
 103 #include <net/sock.h>
 104 #include <net/tcp_states.h>
 105 #include <net/af_unix.h>
 106 #include <linux/proc_fs.h>
 107 #include <linux/seq_file.h>
 108 #include <net/scm.h>
 109 #include <linux/init.h>
 110 #include <linux/poll.h>
 111 #include <linux/rtnetlink.h>
 112 #include <linux/mount.h>
 113 #include <net/checksum.h>
 114 #include <linux/security.h>
 115 #include <linux/splice.h>
 116 #include <linux/freezer.h>
 117 #include <linux/file.h>
 118 #include <linux/btf_ids.h>
 119
 120 #include "scm.h"
 121
 122 static atomic_long_t unix_nr_socks;
 123 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
 124 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
 125
 126 /* SMP locking strategy:
 127  *    hash table is protected with spinlock.
 128  *    each socket state is protected by separate spinlock.
 129  */
 130
 131 static unsigned int unix_unbound_hash(struct sock *sk)
 132 {
 133         unsigned long hash = (unsigned long)sk;
 134
 135         hash ^= hash >> 16;
 136         hash ^= hash >> 8;
 137         hash ^= sk->sk_type;
 138
 139         return hash & UNIX_HASH_MOD;
 140 }
 141
 142 static unsigned int unix_bsd_hash(struct inode *i)
 143 {
 144         return i->i_ino & UNIX_HASH_MOD;
 145 }
 146
 147 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
 148                                        int addr_len, int type)
 149 {
 150         __wsum csum = csum_partial(sunaddr, addr_len, 0);
 151         unsigned int hash;
 152
 153         hash = (__force unsigned int)csum_fold(csum);
 154         hash ^= hash >> 8;
 155         hash ^= type;
 156
 157         return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
 158 }
 159
 160 static void unix_table_double_lock(struct net *net,
 161                                    unsigned int hash1, unsigned int hash2)
 162 {
 163         if (hash1 == hash2) {
 164                 spin_lock(&net->unx.table.locks[hash1]);
 165                 return;
 166         }
 167
 168         if (hash1 > hash2)
 169                 swap(hash1, hash2);
 170
 171         spin_lock(&net->unx.table.locks[hash1]);
 172         spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
 173 }
 174
 175 static void unix_table_double_unlock(struct net *net,
 176                                      unsigned int hash1, unsigned int hash2)
 177 {
 178         if (hash1 == hash2) {
 179                 spin_unlock(&net->unx.table.locks[hash1]);
 180                 return;
 181         }
 182
 183         spin_unlock(&net->unx.table.locks[hash1]);
 184         spin_unlock(&net->unx.table.locks[hash2]);
 185 }
 186
 187 #ifdef CONFIG_SECURITY_NETWORK
 188 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 189 {
 190         UNIXCB(skb).secid = scm->secid;
 191 }
 192
 193 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 194 {
 195         scm->secid = UNIXCB(skb).secid;
 196 }
 197
 198 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 199 {
 200         return (scm->secid == UNIXCB(skb).secid);
 201 }
 202 #else
 203 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 204 { }
 205
 206 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 207 { }
 208
 209 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 210 {
 211         return true;
 212 }
 213 #endif /* CONFIG_SECURITY_NETWORK */
 214
/* Peer pointer stored in the unix_sock of @sk.  The expansion is an lvalue,
 * so the macro is used for assignment as well as for reading (for example
 * "unix_peer(sk) = NULL" in unix_release_sock()).
 */
#define unix_peer(sk) (unix_sk(sk)->peer)
 216
 217 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 218 {
 219         return unix_peer(osk) == sk;
 220 }
 221
 222 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 223 {
 224         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 225 }
 226
 227 static inline int unix_recvq_full(const struct sock *sk)
 228 {
 229         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 230 }
 231
 232 static inline int unix_recvq_full_lockless(const struct sock *sk)
 233 {
 234         return skb_queue_len_lockless(&sk->sk_receive_queue) >
 235                 READ_ONCE(sk->sk_max_ack_backlog);
 236 }
 237
 238 struct sock *unix_peer_get(struct sock *s)
 239 {
 240         struct sock *peer;
 241
 242         unix_state_lock(s);
 243         peer = unix_peer(s);
 244         if (peer)
 245                 sock_hold(peer);
 246         unix_state_unlock(s);
 247         return peer;
 248 }
 249 EXPORT_SYMBOL_GPL(unix_peer_get);
 250
 251 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
 252                                              int addr_len)
 253 {
 254         struct unix_address *addr;
 255
 256         addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
 257         if (!addr)
 258                 return NULL;
 259
 260         refcount_set(&addr->refcnt, 1);
 261         addr->len = addr_len;
 262         memcpy(addr->name, sunaddr, addr_len);
 263
 264         return addr;
 265 }
 266
 267 static inline void unix_release_addr(struct unix_address *addr)
 268 {
 269         if (refcount_dec_and_test(&addr->refcnt))
 270                 kfree(addr);
 271 }
 272
 273 /*
 274  *      Check unix socket name:
 275  *              - should be not zero length.
 276  *              - if started by not zero, should be NULL terminated (FS object)
 277  *              - if started by zero, it is abstract name.
 278  */
 279
 280 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
 281 {
 282         if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
 283             addr_len > sizeof(*sunaddr))
 284                 return -EINVAL;
 285
 286         if (sunaddr->sun_family != AF_UNIX)
 287                 return -EINVAL;
 288
 289         return 0;
 290 }
 291
 292 static void unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
 293 {
 294         /* This may look like an off by one error but it is a bit more
 295          * subtle.  108 is the longest valid AF_UNIX path for a binding.
 296          * sun_path[108] doesn't as such exist.  However in kernel space
 297          * we are guaranteed that it is a valid memory location in our
 298          * kernel address buffer because syscall functions always pass
 299          * a pointer of struct sockaddr_storage which has a bigger buffer
 300          * than 108.
 301          */
 302         ((char *)sunaddr)[addr_len] = 0;
 303 }
 304
 305 static void __unix_remove_socket(struct sock *sk)
 306 {
 307         sk_del_node_init(sk);
 308 }
 309
 310 static void __unix_insert_socket(struct net *net, struct sock *sk)
 311 {
 312         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
 313         sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
 314 }
 315
 316 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
 317                                  struct unix_address *addr, unsigned int hash)
 318 {
 319         __unix_remove_socket(sk);
 320         smp_store_release(&unix_sk(sk)->addr, addr);
 321
 322         sk->sk_hash = hash;
 323         __unix_insert_socket(net, sk);
 324 }
 325
 326 static void unix_remove_socket(struct net *net, struct sock *sk)
 327 {
 328         spin_lock(&net->unx.table.locks[sk->sk_hash]);
 329         __unix_remove_socket(sk);
 330         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
 331 }
 332
 333 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
 334 {
 335         spin_lock(&net->unx.table.locks[sk->sk_hash]);
 336         __unix_insert_socket(net, sk);
 337         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
 338 }
 339
 340 static void unix_insert_bsd_socket(struct sock *sk)
 341 {
 342         spin_lock(&bsd_socket_locks[sk->sk_hash]);
 343         sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
 344         spin_unlock(&bsd_socket_locks[sk->sk_hash]);
 345 }
 346
 347 static void unix_remove_bsd_socket(struct sock *sk)
 348 {
 349         if (!hlist_unhashed(&sk->sk_bind_node)) {
 350                 spin_lock(&bsd_socket_locks[sk->sk_hash]);
 351                 __sk_del_bind_node(sk);
 352                 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
 353
 354                 sk_node_init(&sk->sk_bind_node);
 355         }
 356 }
 357
 358 static struct sock *__unix_find_socket_byname(struct net *net,
 359                                               struct sockaddr_un *sunname,
 360                                               int len, unsigned int hash)
 361 {
 362         struct sock *s;
 363
 364         sk_for_each(s, &net->unx.table.buckets[hash]) {
 365                 struct unix_sock *u = unix_sk(s);
 366
 367                 if (u->addr->len == len &&
 368                     !memcmp(u->addr->name, sunname, len))
 369                         return s;
 370         }
 371         return NULL;
 372 }
 373
 374 static inline struct sock *unix_find_socket_byname(struct net *net,
 375                                                    struct sockaddr_un *sunname,
 376                                                    int len, unsigned int hash)
 377 {
 378         struct sock *s;
 379
 380         spin_lock(&net->unx.table.locks[hash]);
 381         s = __unix_find_socket_byname(net, sunname, len, hash);
 382         if (s)
 383                 sock_hold(s);
 384         spin_unlock(&net->unx.table.locks[hash]);
 385         return s;
 386 }
 387
 388 static struct sock *unix_find_socket_byinode(struct inode *i)
 389 {
 390         unsigned int hash = unix_bsd_hash(i);
 391         struct sock *s;
 392
 393         spin_lock(&bsd_socket_locks[hash]);
 394         sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
 395                 struct dentry *dentry = unix_sk(s)->path.dentry;
 396
 397                 if (dentry && d_backing_inode(dentry) == i) {
 398                         sock_hold(s);
 399                         spin_unlock(&bsd_socket_locks[hash]);
 400                         return s;
 401                 }
 402         }
 403         spin_unlock(&bsd_socket_locks[hash]);
 404         return NULL;
 405 }
 406
 407 /* Support code for asymmetrically connected dgram sockets
 408  *
 409  * If a datagram socket is connected to a socket not itself connected
 410  * to the first socket (eg, /dev/log), clients may only enqueue more
 411  * messages if the present receive queue of the server socket is not
 412  * "too large". This means there's a second writeability condition
 413  * poll and sendmsg need to test. The dgram recv code will do a wake
 414  * up on the peer_wait wait queue of a socket upon reception of a
 415  * datagram which needs to be propagated to sleeping would-be writers
 416  * since these might not have sent anything so far. This can't be
 417  * accomplished via poll_wait because the lifetime of the server
 418  * socket might be less than that of its clients if these break their
 419  * association with it or if the server socket is closed while clients
 420  * are still connected to it and there's no way to inform "a polling
 421  * implementation" that it should let go of a certain wait queue
 422  *
 423  * In order to propagate a wake up, a wait_queue_entry_t of the client
 424  * socket is enqueued on the peer_wait queue of the server socket
 425  * whose wake function does a wake_up on the ordinary client socket
 426  * wait queue. This connection is established whenever a write (or
 427  * poll for write) hit the flow control condition and broken when the
 428  * association to the server socket is dissolved or after a wake up
 429  * was relayed.
 430  */
 431
 432 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
 433                                       void *key)
 434 {
 435         struct unix_sock *u;
 436         wait_queue_head_t *u_sleep;
 437
 438         u = container_of(q, struct unix_sock, peer_wake);
 439
 440         __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
 441                             q);
 442         u->peer_wake.private = NULL;
 443
 444         /* relaying can only happen while the wq still exists */
 445         u_sleep = sk_sleep(&u->sk);
 446         if (u_sleep)
 447                 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
 448
 449         return 0;
 450 }
 451
 452 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
 453 {
 454         struct unix_sock *u, *u_other;
 455         int rc;
 456
 457         u = unix_sk(sk);
 458         u_other = unix_sk(other);
 459         rc = 0;
 460         spin_lock(&u_other->peer_wait.lock);
 461
 462         if (!u->peer_wake.private) {
 463                 u->peer_wake.private = other;
 464                 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
 465
 466                 rc = 1;
 467         }
 468
 469         spin_unlock(&u_other->peer_wait.lock);
 470         return rc;
 471 }
 472
 473 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
 474                                             struct sock *other)
 475 {
 476         struct unix_sock *u, *u_other;
 477
 478         u = unix_sk(sk);
 479         u_other = unix_sk(other);
 480         spin_lock(&u_other->peer_wait.lock);
 481
 482         if (u->peer_wake.private == other) {
 483                 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
 484                 u->peer_wake.private = NULL;
 485         }
 486
 487         spin_unlock(&u_other->peer_wait.lock);
 488 }
 489
 490 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
 491                                                    struct sock *other)
 492 {
 493         unix_dgram_peer_wake_disconnect(sk, other);
 494         wake_up_interruptible_poll(sk_sleep(sk),
 495                                    EPOLLOUT |
 496                                    EPOLLWRNORM |
 497                                    EPOLLWRBAND);
 498 }
 499
 500 /* preconditions:
 501  *      - unix_peer(sk) == other
 502  *      - association is stable
 503  */
 504 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
 505 {
 506         int connected;
 507
 508         connected = unix_dgram_peer_wake_connect(sk, other);
 509
 510         /* If other is SOCK_DEAD, we want to make sure we signal
 511          * POLLOUT, such that a subsequent write() can get a
 512          * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
 513          * to other and its full, we will hang waiting for POLLOUT.
 514          */
 515         if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
 516                 return 1;
 517
 518         if (connected)
 519                 unix_dgram_peer_wake_disconnect(sk, other);
 520
 521         return 0;
 522 }
 523
 524 static int unix_writable(const struct sock *sk)
 525 {
 526         return sk->sk_state != TCP_LISTEN &&
 527                (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 528 }
 529
 530 static void unix_write_space(struct sock *sk)
 531 {
 532         struct socket_wq *wq;
 533
 534         rcu_read_lock();
 535         if (unix_writable(sk)) {
 536                 wq = rcu_dereference(sk->sk_wq);
 537                 if (skwq_has_sleeper(wq))
 538                         wake_up_interruptible_sync_poll(&wq->wait,
 539                                 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
 540                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 541         }
 542         rcu_read_unlock();
 543 }
 544
 545 /* When dgram socket disconnects (or changes its peer), we clear its receive
 546  * queue of packets arrived from previous peer. First, it allows to do
 547  * flow control based only on wmem_alloc; second, sk connected to peer
 548  * may receive messages only from that peer. */
 549 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 550 {
 551         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 552                 skb_queue_purge(&sk->sk_receive_queue);
 553                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 554
 555                 /* If one link of bidirectional dgram pipe is disconnected,
 556                  * we signal error. Messages are lost. Do not make this,
 557                  * when peer was not connected to us.
 558                  */
 559                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 560                         WRITE_ONCE(other->sk_err, ECONNRESET);
 561                         sk_error_report(other);
 562                 }
 563         }
 564         other->sk_state = TCP_CLOSE;
 565 }
 566
 567 static void unix_sock_destructor(struct sock *sk)
 568 {
 569         struct unix_sock *u = unix_sk(sk);
 570
 571         skb_queue_purge(&sk->sk_receive_queue);
 572
 573         DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
 574         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
 575         DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
 576         if (!sock_flag(sk, SOCK_DEAD)) {
 577                 pr_info("Attempt to release alive unix socket: %p\n", sk);
 578                 return;
 579         }
 580
 581         if (u->addr)
 582                 unix_release_addr(u->addr);
 583
 584         atomic_long_dec(&unix_nr_socks);
 585         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 586 #ifdef UNIX_REFCNT_DEBUG
 587         pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
 588                 atomic_long_read(&unix_nr_socks));
 589 #endif
 590 }
 591
 592 static void unix_release_sock(struct sock *sk, int embrion)
 593 {
 594         struct unix_sock *u = unix_sk(sk);
 595         struct sock *skpair;
 596         struct sk_buff *skb;
 597         struct path path;
 598         int state;
 599
 600         unix_remove_socket(sock_net(sk), sk);
 601         unix_remove_bsd_socket(sk);
 602
 603         /* Clear state */
 604         unix_state_lock(sk);
 605         sock_orphan(sk);
 606         WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
 607         path         = u->path;
 608         u->path.dentry = NULL;
 609         u->path.mnt = NULL;
 610         state = sk->sk_state;
 611         sk->sk_state = TCP_CLOSE;
 612
 613         skpair = unix_peer(sk);
 614         unix_peer(sk) = NULL;
 615
 616         unix_state_unlock(sk);
 617
 618 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
 619         if (u->oob_skb) {
 620                 kfree_skb(u->oob_skb);
 621                 u->oob_skb = NULL;
 622         }
 623 #endif
 624
 625         wake_up_interruptible_all(&u->peer_wait);
 626
 627         if (skpair != NULL) {
 628                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 629                         unix_state_lock(skpair);
 630                         /* No more writes */
 631                         WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
 632                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
 633                                 WRITE_ONCE(skpair->sk_err, ECONNRESET);
 634                         unix_state_unlock(skpair);
 635                         skpair->sk_state_change(skpair);
 636                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 637                 }
 638
 639                 unix_dgram_peer_wake_disconnect(sk, skpair);
 640                 sock_put(skpair); /* It may now die */
 641         }
 642
 643         /* Try to flush out this socket. Throw out buffers at least */
 644
 645         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 646                 if (state == TCP_LISTEN)
 647                         unix_release_sock(skb->sk, 1);
 648                 /* passed fds are erased in the kfree_skb hook        */
 649                 UNIXCB(skb).consumed = skb->len;
 650                 kfree_skb(skb);
 651         }
 652
 653         if (path.dentry)
 654                 path_put(&path);
 655
 656         sock_put(sk);
 657
 658         /* ---- Socket is dead now and most probably destroyed ---- */
 659
 660         /*
 661          * Fixme: BSD difference: In BSD all sockets connected to us get
 662          *        ECONNRESET and we die on the spot. In Linux we behave
 663          *        like files and pipes do and wait for the last
 664          *        dereference.
 665          *
 666          * Can't we simply set sock->err?
 667          *
 668          *        What the above comment does talk about? --ANK(980817)
 669          */
 670
 671         if (unix_tot_inflight)
 672                 unix_gc();              /* Garbage collect fds */
 673 }
 674
 675 static void init_peercred(struct sock *sk)
 676 {
 677         const struct cred *old_cred;
 678         struct pid *old_pid;
 679
 680         spin_lock(&sk->sk_peer_lock);
 681         old_pid = sk->sk_peer_pid;
 682         old_cred = sk->sk_peer_cred;
 683         sk->sk_peer_pid  = get_pid(task_tgid(current));
 684         sk->sk_peer_cred = get_current_cred();
 685         spin_unlock(&sk->sk_peer_lock);
 686
 687         put_pid(old_pid);
 688         put_cred(old_cred);
 689 }
 690
 691 static void copy_peercred(struct sock *sk, struct sock *peersk)
 692 {
 693         const struct cred *old_cred;
 694         struct pid *old_pid;
 695
 696         if (sk < peersk) {
 697                 spin_lock(&sk->sk_peer_lock);
 698                 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
 699         } else {
 700                 spin_lock(&peersk->sk_peer_lock);
 701                 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
 702         }
 703         old_pid = sk->sk_peer_pid;
 704         old_cred = sk->sk_peer_cred;
 705         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 706         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 707
 708         spin_unlock(&sk->sk_peer_lock);
 709         spin_unlock(&peersk->sk_peer_lock);
 710
 711         put_pid(old_pid);
 712         put_cred(old_cred);
 713 }
 714
 715 static int unix_listen(struct socket *sock, int backlog)
 716 {
 717         int err;
 718         struct sock *sk = sock->sk;
 719         struct unix_sock *u = unix_sk(sk);
 720
 721         err = -EOPNOTSUPP;
 722         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 723                 goto out;       /* Only stream/seqpacket sockets accept */
 724         err = -EINVAL;
 725         if (!u->addr)
 726                 goto out;       /* No listens on an unbound socket */
 727         unix_state_lock(sk);
 728         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 729                 goto out_unlock;
 730         if (backlog > sk->sk_max_ack_backlog)
 731                 wake_up_interruptible_all(&u->peer_wait);
 732         sk->sk_max_ack_backlog  = backlog;
 733         sk->sk_state            = TCP_LISTEN;
 734         /* set credentials so connect can copy them */
 735         init_peercred(sk);
 736         err = 0;
 737
 738 out_unlock:
 739         unix_state_unlock(sk);
 740 out:
 741         return err;
 742 }
 743
 744 static int unix_release(struct socket *);
 745 static int unix_bind(struct socket *, struct sockaddr *, int);
 746 static int unix_stream_connect(struct socket *, struct sockaddr *,
 747                                int addr_len, int flags);
 748 static int unix_socketpair(struct socket *, struct socket *);
 749 static int unix_accept(struct socket *, struct socket *, int, bool);
 750 static int unix_getname(struct socket *, struct sockaddr *, int);
 751 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
 752 static __poll_t unix_dgram_poll(struct file *, struct socket *,
 753                                     poll_table *);
 754 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 755 #ifdef CONFIG_COMPAT
 756 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 757 #endif
 758 static int unix_shutdown(struct socket *, int);
 759 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
 760 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
 761 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
 762                                        struct pipe_inode_info *, size_t size,
 763                                        unsigned int flags);
 764 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
 765 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
 766 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
 767 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
 768 static int unix_dgram_connect(struct socket *, struct sockaddr *,
 769                               int, int);
 770 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
 771 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
 772                                   int);
 773
 774 static int unix_set_peek_off(struct sock *sk, int val)
 775 {
 776         struct unix_sock *u = unix_sk(sk);
 777
 778         if (mutex_lock_interruptible(&u->iolock))
 779                 return -EINTR;
 780
 781         sk->sk_peek_off = val;
 782         mutex_unlock(&u->iolock);
 783
 784         return 0;
 785 }
 786
 787 #ifdef CONFIG_PROC_FS
 788 static int unix_count_nr_fds(struct sock *sk)
 789 {
 790         struct sk_buff *skb;
 791         struct unix_sock *u;
 792         int nr_fds = 0;
 793
 794         spin_lock(&sk->sk_receive_queue.lock);
 795         skb = skb_peek(&sk->sk_receive_queue);
 796         while (skb) {
 797                 u = unix_sk(skb->sk);
 798                 nr_fds += atomic_read(&u->scm_stat.nr_fds);
 799                 skb = skb_peek_next(skb, &sk->sk_receive_queue);
 800         }
 801         spin_unlock(&sk->sk_receive_queue.lock);
 802
 803         return nr_fds;
 804 }
 805
 806 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
 807 {
 808         struct sock *sk = sock->sk;
 809         unsigned char s_state;
 810         struct unix_sock *u;
 811         int nr_fds = 0;
 812
 813         if (sk) {
 814                 s_state = READ_ONCE(sk->sk_state);
 815                 u = unix_sk(sk);
 816
 817                 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
 818                  * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
 819                  * SOCK_DGRAM is ordinary. So, no lock is needed.
 820                  */
 821                 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
 822                         nr_fds = atomic_read(&u->scm_stat.nr_fds);
 823                 else if (s_state == TCP_LISTEN)
 824                         nr_fds = unix_count_nr_fds(sk);
 825
 826                 seq_printf(m, "scm_fds: %u\n", nr_fds);
 827         }
 828 }
 829 #else
 830 #define unix_show_fdinfo NULL
 831 #endif
 832
 833 static const struct proto_ops unix_stream_ops = {
 834         .family =       PF_UNIX,
 835         .owner =        THIS_MODULE,
 836         .release =      unix_release,
 837         .bind =         unix_bind,
 838         .connect =      unix_stream_connect,
 839         .socketpair =   unix_socketpair,
 840         .accept =       unix_accept,
 841         .getname =      unix_getname,
 842         .poll =         unix_poll,
 843         .ioctl =        unix_ioctl,
 844 #ifdef CONFIG_COMPAT
 845         .compat_ioctl = unix_compat_ioctl,
 846 #endif
 847         .listen =       unix_listen,
 848         .shutdown =     unix_shutdown,
 849         .sendmsg =      unix_stream_sendmsg,
 850         .recvmsg =      unix_stream_recvmsg,
 851         .read_skb =     unix_stream_read_skb,
 852         .mmap =         sock_no_mmap,
 853         .splice_read =  unix_stream_splice_read,
 854         .set_peek_off = unix_set_peek_off,
 855         .show_fdinfo =  unix_show_fdinfo,
 856 };
 857
 858 static const struct proto_ops unix_dgram_ops = {
 859         .family =       PF_UNIX,
 860         .owner =        THIS_MODULE,
 861         .release =      unix_release,
 862         .bind =         unix_bind,
 863         .connect =      unix_dgram_connect,
 864         .socketpair =   unix_socketpair,
 865         .accept =       sock_no_accept,
 866         .getname =      unix_getname,
 867         .poll =         unix_dgram_poll,
 868         .ioctl =        unix_ioctl,
 869 #ifdef CONFIG_COMPAT
 870         .compat_ioctl = unix_compat_ioctl,
 871 #endif
 872         .listen =       sock_no_listen,
 873         .shutdown =     unix_shutdown,
 874         .sendmsg =      unix_dgram_sendmsg,
 875         .read_skb =     unix_read_skb,
 876         .recvmsg =      unix_dgram_recvmsg,
 877         .mmap =         sock_no_mmap,
 878         .set_peek_off = unix_set_peek_off,
 879         .show_fdinfo =  unix_show_fdinfo,
 880 };
 881
 882 static const struct proto_ops unix_seqpacket_ops = {
 883         .family =       PF_UNIX,
 884         .owner =        THIS_MODULE,
 885         .release =      unix_release,
 886         .bind =         unix_bind,
 887         .connect =      unix_stream_connect,
 888         .socketpair =   unix_socketpair,
 889         .accept =       unix_accept,
 890         .getname =      unix_getname,
 891         .poll =         unix_dgram_poll,
 892         .ioctl =        unix_ioctl,
 893 #ifdef CONFIG_COMPAT
 894         .compat_ioctl = unix_compat_ioctl,
 895 #endif
 896         .listen =       unix_listen,
 897         .shutdown =     unix_shutdown,
 898         .sendmsg =      unix_seqpacket_sendmsg,
 899         .recvmsg =      unix_seqpacket_recvmsg,
 900         .mmap =         sock_no_mmap,
 901         .set_peek_off = unix_set_peek_off,
 902         .show_fdinfo =  unix_show_fdinfo,
 903 };
 904
 905 static void unix_close(struct sock *sk, long timeout)
 906 {
 907         /* Nothing to do here, unix socket does not need a ->close().
 908          * This is merely for sockmap.
 909          */
 910 }
 911
 912 static void unix_unhash(struct sock *sk)
 913 {
 914         /* Nothing to do here, unix socket does not need a ->unhash().
 915          * This is merely for sockmap.
 916          */
 917 }
 918
 919 static bool unix_bpf_bypass_getsockopt(int level, int optname)
 920 {
 921         if (level == SOL_SOCKET) {
 922                 switch (optname) {
 923                 case SO_PEERPIDFD:
 924                         return true;
 925                 default:
 926                         return false;
 927                 }
 928         }
 929
 930         return false;
 931 }
 932
 933 struct proto unix_dgram_proto = {
 934         .name                   = "UNIX",
 935         .owner                  = THIS_MODULE,
 936         .obj_size               = sizeof(struct unix_sock),
 937         .close                  = unix_close,
 938         .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
 939 #ifdef CONFIG_BPF_SYSCALL
 940         .psock_update_sk_prot   = unix_dgram_bpf_update_proto,
 941 #endif
 942 };
 943
 944 struct proto unix_stream_proto = {
 945         .name                   = "UNIX-STREAM",
 946         .owner                  = THIS_MODULE,
 947         .obj_size               = sizeof(struct unix_sock),
 948         .close                  = unix_close,
 949         .unhash                 = unix_unhash,
 950         .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
 951 #ifdef CONFIG_BPF_SYSCALL
 952         .psock_update_sk_prot   = unix_stream_bpf_update_proto,
 953 #endif
 954 };
 955
 956 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
 957 {
 958         struct unix_sock *u;
 959         struct sock *sk;
 960         int err;
 961
 962         atomic_long_inc(&unix_nr_socks);
 963         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
 964                 err = -ENFILE;
 965                 goto err;
 966         }
 967
 968         if (type == SOCK_STREAM)
 969                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
 970         else /*dgram and  seqpacket */
 971                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
 972
 973         if (!sk) {
 974                 err = -ENOMEM;
 975                 goto err;
 976         }
 977
 978         sock_init_data(sock, sk);
 979
 980         sk->sk_hash             = unix_unbound_hash(sk);
 981         sk->sk_allocation       = GFP_KERNEL_ACCOUNT;
 982         sk->sk_write_space      = unix_write_space;
 983         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 984         sk->sk_destruct         = unix_sock_destructor;
 985         u         = unix_sk(sk);
 986         u->path.dentry = NULL;
 987         u->path.mnt = NULL;
 988         spin_lock_init(&u->lock);
 989         atomic_long_set(&u->inflight, 0);
 990         INIT_LIST_HEAD(&u->link);
 991         mutex_init(&u->iolock); /* single task reading lock */
 992         mutex_init(&u->bindlock); /* single task binding lock */
 993         init_waitqueue_head(&u->peer_wait);
 994         init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
 995         memset(&u->scm_stat, 0, sizeof(struct scm_stat));
 996         unix_insert_unbound_socket(net, sk);
 997
 998         sock_prot_inuse_add(net, sk->sk_prot, 1);
 999
1000         return sk;
1001
1002 err:
1003         atomic_long_dec(&unix_nr_socks);
1004         return ERR_PTR(err);
1005 }
1006
1007 static int unix_create(struct net *net, struct socket *sock, int protocol,
1008                        int kern)
1009 {
1010         struct sock *sk;
1011
1012         if (protocol && protocol != PF_UNIX)
1013                 return -EPROTONOSUPPORT;
1014
1015         sock->state = SS_UNCONNECTED;
1016
1017         switch (sock->type) {
1018         case SOCK_STREAM:
1019                 sock->ops = &unix_stream_ops;
1020                 break;
1021                 /*
1022                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
1023                  *      nothing uses it.
1024                  */
1025         case SOCK_RAW:
1026                 sock->type = SOCK_DGRAM;
1027                 fallthrough;
1028         case SOCK_DGRAM:
1029                 sock->ops = &unix_dgram_ops;
1030                 break;
1031         case SOCK_SEQPACKET:
1032                 sock->ops = &unix_seqpacket_ops;
1033                 break;
1034         default:
1035                 return -ESOCKTNOSUPPORT;
1036         }
1037
1038         sk = unix_create1(net, sock, kern, sock->type);
1039         if (IS_ERR(sk))
1040                 return PTR_ERR(sk);
1041
1042         return 0;
1043 }
1044
1045 static int unix_release(struct socket *sock)
1046 {
1047         struct sock *sk = sock->sk;
1048
1049         if (!sk)
1050                 return 0;
1051
1052         sk->sk_prot->close(sk, 0);
1053         unix_release_sock(sk, 0);
1054         sock->sk = NULL;
1055
1056         return 0;
1057 }
1058
/*
 * Look up the socket bound to a filesystem (pathname) address.
 *
 * The path is resolved following symlinks and must be writable by the
 * caller.  On success the socket returned by unix_find_socket_byinode()
 * is handed back and the path's atime is touched.
 *
 * Errors (as ERR_PTR):
 *   - whatever kern_path() or path_permission() report,
 *   - -ECONNREFUSED if the inode is not a socket or no socket is found
 *     for it,
 *   - -EPROTOTYPE if the socket found has a different sk_type than @type.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
                                  int type)
{
        struct inode *inode;
        struct path path;
        struct sock *sk;
        int err;

        unix_mkname_bsd(sunaddr, addr_len);

        err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
        if (err)
                return ERR_PTR(err);

        err = path_permission(&path, MAY_WRITE);
        if (err)
                goto drop_path;

        inode = d_backing_inode(path.dentry);
        if (!S_ISSOCK(inode->i_mode)) {
                err = -ECONNREFUSED;
                goto drop_path;
        }

        sk = unix_find_socket_byinode(inode);
        if (!sk) {
                err = -ECONNREFUSED;
                goto drop_path;
        }

        if (sk->sk_type != type) {
                /* Wrong socket type: give the socket back before bailing. */
                sock_put(sk);
                err = -EPROTOTYPE;
                goto drop_path;
        }

        touch_atime(&path);
        path_put(&path);

        return sk;

drop_path:
        path_put(&path);
        return ERR_PTR(err);
}
1102
/*
 * Look up a socket by its abstract address in @net.
 *
 * Returns the socket found by unix_find_socket_byname(), or
 * ERR_PTR(-ECONNREFUSED) when there is none.  If the socket found carries
 * a filesystem path, that path's atime is touched.
 */
static struct sock *unix_find_abstract(struct net *net,
                                       struct sockaddr_un *sunaddr,
                                       int addr_len, int type)
{
        unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
        struct unix_sock *found;
        struct sock *sk;

        sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
        if (!sk)
                return ERR_PTR(-ECONNREFUSED);

        found = unix_sk(sk);
        if (found->path.dentry)
                touch_atime(&found->path);

        return sk;
}
1121
1122 static struct sock *unix_find_other(struct net *net,
1123                                     struct sockaddr_un *sunaddr,
1124                                     int addr_len, int type)
1125 {
1126         struct sock *sk;
1127
1128         if (sunaddr->sun_path[0])
1129                 sk = unix_find_bsd(sunaddr, addr_len, type);
1130         else
1131                 sk = unix_find_abstract(net, sunaddr, addr_len, type);
1132
1133         return sk;
1134 }
1135
1136 static int unix_autobind(struct sock *sk)
1137 {
1138         unsigned int new_hash, old_hash = sk->sk_hash;
1139         struct unix_sock *u = unix_sk(sk);
1140         struct net *net = sock_net(sk);
1141         struct unix_address *addr;
1142         u32 lastnum, ordernum;
1143         int err;
1144
1145         err = mutex_lock_interruptible(&u->bindlock);
1146         if (err)
1147                 return err;
1148
1149         if (u->addr)
1150                 goto out;
1151
1152         err = -ENOMEM;
1153         addr = kzalloc(sizeof(*addr) +
1154                        offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1155         if (!addr)
1156                 goto out;
1157
1158         addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1159         addr->name->sun_family = AF_UNIX;
1160         refcount_set(&addr->refcnt, 1);
1161
1162         ordernum = get_random_u32();
1163         lastnum = ordernum & 0xFFFFF;
1164 retry:
1165         ordernum = (ordernum + 1) & 0xFFFFF;
1166         sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1167
1168         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1169         unix_table_double_lock(net, old_hash, new_hash);
1170
1171         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1172                 unix_table_double_unlock(net, old_hash, new_hash);
1173
1174                 /* __unix_find_socket_byname() may take long time if many names
1175                  * are already in use.
1176                  */
1177                 cond_resched();
1178
1179                 if (ordernum == lastnum) {
1180                         /* Give up if all names seems to be in use. */
1181                         err = -ENOSPC;
1182                         unix_release_addr(addr);
1183                         goto out;
1184                 }
1185
1186                 goto retry;
1187         }
1188
1189         __unix_set_addr_hash(net, sk, addr, new_hash);
1190         unix_table_double_unlock(net, old_hash, new_hash);
1191         err = 0;
1192
1193 out:    mutex_unlock(&u->bindlock);
1194         return err;
1195 }
1196
1197 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1198                          int addr_len)
1199 {
1200         umode_t mode = S_IFSOCK |
1201                (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1202         unsigned int new_hash, old_hash = sk->sk_hash;
1203         struct unix_sock *u = unix_sk(sk);
1204         struct net *net = sock_net(sk);
1205         struct mnt_idmap *idmap;
1206         struct unix_address *addr;
1207         struct dentry *dentry;
1208         struct path parent;
1209         int err;
1210
1211         unix_mkname_bsd(sunaddr, addr_len);
1212         addr_len = strlen(sunaddr->sun_path) +
1213                 offsetof(struct sockaddr_un, sun_path) + 1;
1214
1215         addr = unix_create_addr(sunaddr, addr_len);
1216         if (!addr)
1217                 return -ENOMEM;
1218
1219         /*
1220          * Get the parent directory, calculate the hash for last
1221          * component.
1222          */
1223         dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1224         if (IS_ERR(dentry)) {
1225                 err = PTR_ERR(dentry);
1226                 goto out;
1227         }
1228
1229         /*
1230          * All right, let's create it.
1231          */
1232         idmap = mnt_idmap(parent.mnt);
1233         err = security_path_mknod(&parent, dentry, mode, 0);
1234         if (!err)
1235                 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1236         if (err)
1237                 goto out_path;
1238         err = mutex_lock_interruptible(&u->bindlock);
1239         if (err)
1240                 goto out_unlink;
1241         if (u->addr)
1242                 goto out_unlock;
1243
1244         new_hash = unix_bsd_hash(d_backing_inode(dentry));
1245         unix_table_double_lock(net, old_hash, new_hash);
1246         u->path.mnt = mntget(parent.mnt);
1247         u->path.dentry = dget(dentry);
1248         __unix_set_addr_hash(net, sk, addr, new_hash);
1249         unix_table_double_unlock(net, old_hash, new_hash);
1250         unix_insert_bsd_socket(sk);
1251         mutex_unlock(&u->bindlock);
1252         done_path_create(&parent, dentry);
1253         return 0;
1254
1255 out_unlock:
1256         mutex_unlock(&u->bindlock);
1257         err = -EINVAL;
1258 out_unlink:
1259         /* failed after successful mknod?  unlink what we'd created... */
1260         vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1261 out_path:
1262         done_path_create(&parent, dentry);
1263 out:
1264         unix_release_addr(addr);
1265         return err == -EEXIST ? -EADDRINUSE : err;
1266 }
1267
1268 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1269                               int addr_len)
1270 {
1271         unsigned int new_hash, old_hash = sk->sk_hash;
1272         struct unix_sock *u = unix_sk(sk);
1273         struct net *net = sock_net(sk);
1274         struct unix_address *addr;
1275         int err;
1276
1277         addr = unix_create_addr(sunaddr, addr_len);
1278         if (!addr)
1279                 return -ENOMEM;
1280
1281         err = mutex_lock_interruptible(&u->bindlock);
1282         if (err)
1283                 goto out;
1284
1285         if (u->addr) {
1286                 err = -EINVAL;
1287                 goto out_mutex;
1288         }
1289
1290         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1291         unix_table_double_lock(net, old_hash, new_hash);
1292
1293         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1294                 goto out_spin;
1295
1296         __unix_set_addr_hash(net, sk, addr, new_hash);
1297         unix_table_double_unlock(net, old_hash, new_hash);
1298         mutex_unlock(&u->bindlock);
1299         return 0;
1300
1301 out_spin:
1302         unix_table_double_unlock(net, old_hash, new_hash);
1303         err = -EADDRINUSE;
1304 out_mutex:
1305         mutex_unlock(&u->bindlock);
1306 out:
1307         unix_release_addr(addr);
1308         return err;
1309 }
1310
1311 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1312 {
1313         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1314         struct sock *sk = sock->sk;
1315         int err;
1316
1317         if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1318             sunaddr->sun_family == AF_UNIX)
1319                 return unix_autobind(sk);
1320
1321         err = unix_validate_addr(sunaddr, addr_len);
1322         if (err)
1323                 return err;
1324
1325         if (sunaddr->sun_path[0])
1326                 err = unix_bind_bsd(sk, sunaddr, addr_len);
1327         else
1328                 err = unix_bind_abstract(sk, sunaddr, addr_len);
1329
1330         return err;
1331 }
1332
/*
 * Take the state locks of @sk1 and @sk2.
 *
 * When @sk2 is NULL or identical to @sk1 only @sk1 is locked.  Otherwise
 * the socket with the lower address is locked first and the other one is
 * taken with the _nested variant, so that two callers passing the same
 * pair in opposite order acquire the locks in the same sequence.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
        struct sock *first, *second;

        if (!sk2 || unlikely(sk1 == sk2)) {
                unix_state_lock(sk1);
                return;
        }

        if (sk1 < sk2) {
                first = sk1;
                second = sk2;
        } else {
                first = sk2;
                second = sk1;
        }

        unix_state_lock(first);
        unix_state_lock_nested(second);
}
1347
/*
 * Counterpart of unix_state_double_lock(): release @sk1's state lock and,
 * unless @sk2 is NULL or the same socket, @sk2's state lock as well.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
        unix_state_unlock(sk1);

        if (!sk2 || unlikely(sk1 == sk2))
                return;

        unix_state_unlock(sk2);
}
1357
1358 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1359                               int alen, int flags)
1360 {
1361         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1362         struct sock *sk = sock->sk;
1363         struct sock *other;
1364         int err;
1365
1366         err = -EINVAL;
1367         if (alen < offsetofend(struct sockaddr, sa_family))
1368                 goto out;
1369
1370         if (addr->sa_family != AF_UNSPEC) {
1371                 err = unix_validate_addr(sunaddr, alen);
1372                 if (err)
1373                         goto out;
1374
1375                 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1376                      test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1377                     !unix_sk(sk)->addr) {
1378                         err = unix_autobind(sk);
1379                         if (err)
1380                                 goto out;
1381                 }
1382
1383 restart:
1384                 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1385                 if (IS_ERR(other)) {
1386                         err = PTR_ERR(other);
1387                         goto out;
1388                 }
1389
1390                 unix_state_double_lock(sk, other);
1391
1392                 /* Apparently VFS overslept socket death. Retry. */
1393                 if (sock_flag(other, SOCK_DEAD)) {
1394                         unix_state_double_unlock(sk, other);
1395                         sock_put(other);
1396                         goto restart;
1397                 }
1398
1399                 err = -EPERM;
1400                 if (!unix_may_send(sk, other))
1401                         goto out_unlock;
1402
1403                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1404                 if (err)
1405                         goto out_unlock;
1406
1407                 sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1408         } else {
1409                 /*
1410                  *      1003.1g breaking connected state with AF_UNSPEC
1411                  */
1412                 other = NULL;
1413                 unix_state_double_lock(sk, other);
1414         }
1415
1416         /*
1417          * If it was connected, reconnect.
1418          */
1419         if (unix_peer(sk)) {
1420                 struct sock *old_peer = unix_peer(sk);
1421
1422                 unix_peer(sk) = other;
1423                 if (!other)
1424                         sk->sk_state = TCP_CLOSE;
1425                 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1426
1427                 unix_state_double_unlock(sk, other);
1428
1429                 if (other != old_peer)
1430                         unix_dgram_disconnected(sk, old_peer);
1431                 sock_put(old_peer);
1432         } else {
1433                 unix_peer(sk) = other;
1434                 unix_state_double_unlock(sk, other);
1435         }
1436
1437         return 0;
1438
1439 out_unlock:
1440         unix_state_double_unlock(sk, other);
1441         sock_put(other);
1442 out:
1443         return err;
1444 }
1445
1446 static long unix_wait_for_peer(struct sock *other, long timeo)
1447         __releases(&unix_sk(other)->lock)
1448 {
1449         struct unix_sock *u = unix_sk(other);
1450         int sched;
1451         DEFINE_WAIT(wait);
1452
1453         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1454
1455         sched = !sock_flag(other, SOCK_DEAD) &&
1456                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1457                 unix_recvq_full_lockless(other);
1458
1459         unix_state_unlock(other);
1460
1461         if (sched)
1462                 timeo = schedule_timeout(timeo);
1463
1464         finish_wait(&u->peer_wait, &wait);
1465         return timeo;
1466 }
1467
/*
 * ->connect() handler for SOCK_STREAM and SOCK_SEQPACKET sockets.
 *
 * @sock:     connecting socket
 * @uaddr:    address of the listening socket
 * @addr_len: length of @uaddr
 * @flags:    O_NONBLOCK selects a zero send timeout
 *
 * The server-side socket (newsk) and a one-byte skb owned by it are
 * allocated up front.  After the listener has been found and both state
 * locks are held, newsk and @sock's sk are wired up as peers and the skb
 * is queued on the listener's receive queue, where unix_accept() picks it
 * up and recovers newsk through skb->sk.
 *
 * If the listener's queue is full the caller waits (subject to the send
 * timeout) and the whole lookup is restarted.
 *
 * Returns 0 on success.  Errors include those from address validation,
 * autobind, unix_create1(), the peer lookup and the LSM hook, plus
 * -ENOMEM (skb allocation), -ECONNREFUSED (target not listening or
 * receive-shutdown), -EAGAIN (queue full, no timeout left), the
 * sock_intr_errno() value when interrupted by a signal, -EISCONN
 * (already connected) and -EINVAL (any other state of @sock).
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                               int addr_len, int flags)
{
        struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
        struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
        struct unix_sock *u = unix_sk(sk), *newu, *otheru;
        struct net *net = sock_net(sk);
        struct sk_buff *skb = NULL;
        long timeo;
        int err;
        int st;

        err = unix_validate_addr(sunaddr, addr_len);
        if (err)
                goto out;

        if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
             test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
                err = unix_autobind(sk);
                if (err)
                        goto out;
        }

        timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

        /* First of all allocate resources.
         * If we did it after the state is locked,
         * we would have to recheck everything again in any case.
         */

        /* create new sock for complete connection */
        newsk = unix_create1(net, NULL, 0, sock->type);
        if (IS_ERR(newsk)) {
                err = PTR_ERR(newsk);
                newsk = NULL;
                goto out;
        }

        err = -ENOMEM;

        /* Allocate skb for sending to listening sock */
        skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
        if (skb == NULL)
                goto out;

restart:
        /*  Find listening sock. */
        other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
        if (IS_ERR(other)) {
                err = PTR_ERR(other);
                other = NULL;
                goto out;
        }

        /* Latch state of peer */
        unix_state_lock(other);

        /* Apparently VFS overslept socket death. Retry. */
        if (sock_flag(other, SOCK_DEAD)) {
                unix_state_unlock(other);
                sock_put(other);
                goto restart;
        }

        err = -ECONNREFUSED;
        if (other->sk_state != TCP_LISTEN)
                goto out_unlock;
        if (other->sk_shutdown & RCV_SHUTDOWN)
                goto out_unlock;

        if (unix_recvq_full(other)) {
                err = -EAGAIN;
                if (!timeo)
                        goto out_unlock;

                /* Drops other's state lock; we still hold our reference. */
                timeo = unix_wait_for_peer(other, timeo);

                err = sock_intr_errno(timeo);
                if (signal_pending(current))
                        goto out;
                sock_put(other);
                goto restart;
        }

        /* Latch our state.
         *
         * It is tricky place. We need to grab our state lock and cannot
         * drop lock on peer. It is dangerous because deadlock is
         * possible. Connect to self case and simultaneous
         * attempt to connect are eliminated by checking socket
         * state. other is TCP_LISTEN, if sk is TCP_LISTEN we
         * check this before attempt to grab lock.
         *
         * Well, and we have to recheck the state after socket locked.
         *
         * Our own state lock is not held yet, so this is a lockless read
         * and must be marked as such.
         */
        st = READ_ONCE(sk->sk_state);

        switch (st) {
        case TCP_CLOSE:
                /* This is ok... continue with connect */
                break;
        case TCP_ESTABLISHED:
                /* Socket is already connected */
                err = -EISCONN;
                goto out_unlock;
        default:
                err = -EINVAL;
                goto out_unlock;
        }

        unix_state_lock_nested(sk);

        /* State changed between the lockless latch and taking the lock. */
        if (sk->sk_state != st) {
                unix_state_unlock(sk);
                unix_state_unlock(other);
                sock_put(other);
                goto restart;
        }

        err = security_unix_stream_connect(sk, other, newsk);
        if (err) {
                unix_state_unlock(sk);
                goto out_unlock;
        }

        /* The way is open! Quickly set all the necessary fields... */

        sock_hold(sk);
        unix_peer(newsk)        = sk;
        newsk->sk_state         = TCP_ESTABLISHED;
        newsk->sk_type          = sk->sk_type;
        init_peercred(newsk);
        newu = unix_sk(newsk);
        RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
        otheru = unix_sk(other);

        /* copy address information from listening to new sock
         *
         * The contents of *(otheru->addr) and otheru->path
         * are seen fully set up here, since we have found
         * otheru in hash under its lock.  Insertion into the
         * hash chain we'd found it in had been done in an
         * earlier critical area protected by the chain's lock,
         * the same one where we'd set *(otheru->addr) contents,
         * as well as otheru->path and otheru->addr itself.
         *
         * Using smp_store_release() here to set newu->addr
         * is enough to make those stores, as well as stores
         * to newu->path visible to anyone who gets newu->addr
         * by smp_load_acquire().  IOW, the same warranties
         * as for unix_sock instances bound in unix_bind() or
         * in unix_autobind().
         */
        if (otheru->path.dentry) {
                path_get(&otheru->path);
                newu->path = otheru->path;
        }
        refcount_inc(&otheru->addr->refcnt);
        smp_store_release(&newu->addr, otheru->addr);

        /* Set credentials */
        copy_peercred(sk, other);

        sock->state     = SS_CONNECTED;
        /* Marked store: sk_state has lockless readers (see the latch
         * above and READ_ONCE() in unix_show_fdinfo()).
         */
        WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
        sock_hold(newsk);

        smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
        unix_peer(sk)   = newsk;

        unix_state_unlock(sk);

        /* take ten and send info to listening sock */
        spin_lock(&other->sk_receive_queue.lock);
        __skb_queue_tail(&other->sk_receive_queue, skb);
        spin_unlock(&other->sk_receive_queue.lock);
        unix_state_unlock(other);
        other->sk_data_ready(other);
        sock_put(other);
        return 0;

out_unlock:
        if (other)
                unix_state_unlock(other);

out:
        kfree_skb(skb);
        if (newsk)
                unix_release_sock(newsk, 0);
        if (other)
                sock_put(other);
        return err;
}
1661
1662 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1663 {
1664         struct sock *ska = socka->sk, *skb = sockb->sk;
1665
1666         /* Join our sockets back to back */
1667         sock_hold(ska);
1668         sock_hold(skb);
1669         unix_peer(ska) = skb;
1670         unix_peer(skb) = ska;
1671         init_peercred(ska);
1672         init_peercred(skb);
1673
1674         ska->sk_state = TCP_ESTABLISHED;
1675         skb->sk_state = TCP_ESTABLISHED;
1676         socka->state  = SS_CONNECTED;
1677         sockb->state  = SS_CONNECTED;
1678         return 0;
1679 }
1680
1681 static void unix_sock_inherit_flags(const struct socket *old,
1682                                     struct socket *new)
1683 {
1684         if (test_bit(SOCK_PASSCRED, &old->flags))
1685                 set_bit(SOCK_PASSCRED, &new->flags);
1686         if (test_bit(SOCK_PASSPIDFD, &old->flags))
1687                 set_bit(SOCK_PASSPIDFD, &new->flags);
1688         if (test_bit(SOCK_PASSSEC, &old->flags))
1689                 set_bit(SOCK_PASSSEC, &new->flags);
1690 }
1691
1692 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1693                        bool kern)
1694 {
1695         struct sock *sk = sock->sk;
1696         struct sock *tsk;
1697         struct sk_buff *skb;
1698         int err;
1699
1700         err = -EOPNOTSUPP;
1701         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1702                 goto out;
1703
1704         err = -EINVAL;
1705         if (sk->sk_state != TCP_LISTEN)
1706                 goto out;
1707
1708         /* If socket state is TCP_LISTEN it cannot change (for now...),
1709          * so that no locks are necessary.
1710          */
1711
1712         skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1713                                 &err);
1714         if (!skb) {
1715                 /* This means receive shutdown. */
1716                 if (err == 0)
1717                         err = -EINVAL;
1718                 goto out;
1719         }
1720
1721         tsk = skb->sk;
1722         skb_free_datagram(sk, skb);
1723         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1724
1725         /* attach accepted sock to socket */
1726         unix_state_lock(tsk);
1727         newsock->state = SS_CONNECTED;
1728         unix_sock_inherit_flags(sock, newsock);
1729         sock_graft(tsk, newsock);
1730         unix_state_unlock(tsk);
1731         return 0;
1732
1733 out:
1734         return err;
1735 }
1736
1737
1738 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1739 {
1740         struct sock *sk = sock->sk;
1741         struct unix_address *addr;
1742         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1743         int err = 0;
1744
1745         if (peer) {
1746                 sk = unix_peer_get(sk);
1747
1748                 err = -ENOTCONN;
1749                 if (!sk)
1750                         goto out;
1751                 err = 0;
1752         } else {
1753                 sock_hold(sk);
1754         }
1755
1756         addr = smp_load_acquire(&unix_sk(sk)->addr);
1757         if (!addr) {
1758                 sunaddr->sun_family = AF_UNIX;
1759                 sunaddr->sun_path[0] = 0;
1760                 err = offsetof(struct sockaddr_un, sun_path);
1761         } else {
1762                 err = addr->len;
1763                 memcpy(sunaddr, addr->name, addr->len);
1764         }
1765         sock_put(sk);
1766 out:
1767         return err;
1768 }
1769
/*
 * Give a MSG_PEEK receiver its own copy of the file list attached to
 * @skb: store a duplicate of UNIXCB(skb).fp in @scm->fp, then take and
 * release unix_gc_lock as a barrier against the garbage collector (see
 * the explanation below).
 */
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
        scm->fp = scm_fp_dup(UNIXCB(skb).fp);

        /*
         * Garbage collection of unix sockets starts by selecting a set of
         * candidate sockets which have reference only from being in flight
         * (total_refs == inflight_refs).  This condition is checked once during
         * the candidate collection phase, and candidates are marked as such, so
         * that non-candidates can later be ignored.  While inflight_refs is
         * protected by unix_gc_lock, total_refs (file count) is not, hence this
         * is an instantaneous decision.
         *
         * Once a candidate, however, the socket must not be reinstalled into a
         * file descriptor while the garbage collection is in progress.
         *
         * If the above conditions are met, then the directed graph of
         * candidates (*) does not change while unix_gc_lock is held.
         *
         * Any operation that changes the file count through file descriptors
         * (dup, close, sendmsg) does not change the graph since candidates are
         * not installed in fds.
         *
         * Dequeuing a candidate via recvmsg would install it into an fd, but
         * that takes unix_gc_lock to decrement the inflight count, so it's
         * serialized with garbage collection.
         *
         * MSG_PEEK is special in that it does not change the inflight count,
         * yet does install the socket into an fd.  The following lock/unlock
         * pair is to ensure serialization with garbage collection.  It must be
         * done between incrementing the file count and installing the file into
         * an fd.
         *
         * If garbage collection starts after the barrier provided by the
         * lock/unlock, then it will see the elevated refcount and not mark this
         * as a candidate.  If a garbage collection is already in progress
         * before the file count was incremented, then the lock/unlock pair will
         * ensure that garbage collection is finished before progressing to
         * installing the fd.
         *
         * (*) A -> B where B is on the queue of A or B is on the queue of C
         * which is on the queue of listening socket A.
         */
        spin_lock(&unix_gc_lock);
        spin_unlock(&unix_gc_lock);
}
1816
1817 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1818 {
1819         int err = 0;
1820
1821         UNIXCB(skb).pid  = get_pid(scm->pid);
1822         UNIXCB(skb).uid = scm->creds.uid;
1823         UNIXCB(skb).gid = scm->creds.gid;
1824         UNIXCB(skb).fp = NULL;
1825         unix_get_secdata(scm, skb);
1826         if (scm->fp && send_fds)
1827                 err = unix_attach_fds(scm, skb);
1828
1829         skb->destructor = unix_destruct_scm;
1830         return err;
1831 }
1832
1833 static bool unix_passcred_enabled(const struct socket *sock,
1834                                   const struct sock *other)
1835 {
1836         return test_bit(SOCK_PASSCRED, &sock->flags) ||
1837                test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1838                !other->sk_socket ||
1839                test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1840                test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1841 }
1842
1843 /*
1844  * Some apps rely on write() giving SCM_CREDENTIALS
1845  * We include credentials if source or destination socket
1846  * asserted SOCK_PASSCRED.
1847  */
1848 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1849                             const struct sock *other)
1850 {
1851         if (UNIXCB(skb).pid)
1852                 return;
1853         if (unix_passcred_enabled(sock, other)) {
1854                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1855                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1856         }
1857 }
1858
1859 static bool unix_skb_scm_eq(struct sk_buff *skb,
1860                             struct scm_cookie *scm)
1861 {
1862         return UNIXCB(skb).pid == scm->pid &&
1863                uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1864                gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1865                unix_secdata_eq(scm, skb);
1866 }
1867
1868 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1869 {
1870         struct scm_fp_list *fp = UNIXCB(skb).fp;
1871         struct unix_sock *u = unix_sk(sk);
1872
1873         if (unlikely(fp && fp->count))
1874                 atomic_add(fp->count, &u->scm_stat.nr_fds);
1875 }
1876
1877 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1878 {
1879         struct scm_fp_list *fp = UNIXCB(skb).fp;
1880         struct unix_sock *u = unix_sk(sk);
1881
1882         if (unlikely(fp && fp->count))
1883                 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1884 }
1885
/*
 *      Send AF_UNIX data.
 *
 *      sendmsg() handler for datagram sockets; unix_seqpacket_sendmsg()
 *      also ends up here.  The whole message is placed in a single skb
 *      which is queued on the receiver.
 *
 *      Returns @len on success (also when the receiver's socket filter
 *      drops the packet), or a negative errno.
 */

static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
                              size_t len)
{
        DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
        struct sock *sk = sock->sk, *other = NULL;
        struct unix_sock *u = unix_sk(sk);
        struct scm_cookie scm;
        struct sk_buff *skb;
        int data_len = 0;
        int sk_locked;
        long timeo;
        int err;

        wait_for_unix_gc();
        err = scm_send(sock, msg, &scm, false);
        if (err < 0)
                return err;

        /* From here on every exit path runs scm_destroy(&scm). */

        err = -EOPNOTSUPP;
        if (msg->msg_flags&MSG_OOB)
                goto out;

        if (msg->msg_namelen) {
                /* Explicit destination; it is looked up at "restart". */
                err = unix_validate_addr(sunaddr, msg->msg_namelen);
                if (err)
                        goto out;
        } else {
                /* No address given: send to the connected peer. */
                sunaddr = NULL;
                err = -ENOTCONN;
                other = unix_peer_get(sk);
                if (!other)
                        goto out;
        }

        /* Credential passing is requested but we have no address yet. */
        if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
             test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
                err = unix_autobind(sk);
                if (err)
                        goto out;
        }

        err = -EMSGSIZE;
        if (len > sk->sk_sndbuf - 32)
                goto out;

        /* Whatever exceeds SKB_MAX_ALLOC goes into paged data, capped at
         * MAX_SKB_FRAGS pages and rounded up to a page multiple.
         */
        if (len > SKB_MAX_ALLOC) {
                data_len = min_t(size_t,
                                 len - SKB_MAX_ALLOC,
                                 MAX_SKB_FRAGS * PAGE_SIZE);
                data_len = PAGE_ALIGN(data_len);

                BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
        }

        skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
                                   msg->msg_flags & MSG_DONTWAIT, &err,
                                   PAGE_ALLOC_COSTLY_ORDER);
        if (skb == NULL)
                goto out;

        err = unix_scm_to_skb(&scm, skb, true);
        if (err < 0)
                goto out_free;

        skb_put(skb, len - data_len);
        skb->data_len = data_len;
        skb->len = len;
        err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
        if (err)
                goto out_free;

        timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
        /* Entered with @other either referenced or NULL and no state
         * lock held.
         */
        if (!other) {
                err = -ECONNRESET;
                if (sunaddr == NULL)
                        goto out_free;

                other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
                                        sk->sk_type);
                if (IS_ERR(other)) {
                        err = PTR_ERR(other);
                        other = NULL;
                        goto out_free;
                }
        }

        if (sk_filter(other, skb) < 0) {
                /* Toss the packet but do not return any error to the sender */
                err = len;
                goto out_free;
        }

        /* sk_locked records whether sk's state lock is held in addition
         * to other's.
         */
        sk_locked = 0;
        unix_state_lock(other);
restart_locked:
        err = -EPERM;
        if (!unix_may_send(sk, other))
                goto out_unlock;

        if (unlikely(sock_flag(other, SOCK_DEAD))) {
                /*
                 *      Check with 1003.1g - what should
                 *      datagram error
                 */
                unix_state_unlock(other);
                sock_put(other);

                if (!sk_locked)
                        unix_state_lock(sk);

                err = 0;
                if (sk->sk_type == SOCK_SEQPACKET) {
                        /* We are here only when racing with unix_release_sock()
                         * is clearing @other. Never change state to TCP_CLOSE
                         * unlike SOCK_DGRAM wants.
                         */
                        unix_state_unlock(sk);
                        err = -EPIPE;
                } else if (unix_peer(sk) == other) {
                        /* The dead socket was our connected peer: disconnect. */
                        unix_peer(sk) = NULL;
                        unix_dgram_peer_wake_disconnect_wakeup(sk, other);

                        sk->sk_state = TCP_CLOSE;
                        unix_state_unlock(sk);

                        unix_dgram_disconnected(sk, other);
                        /* Second put: presumably the reference that
                         * unix_peer(sk) was holding.
                         */
                        sock_put(other);
                        err = -ECONNREFUSED;
                } else {
                        unix_state_unlock(sk);
                }

                /* With err == 0, retry; restart looks the address up again. */
                other = NULL;
                if (err)
                        goto out_free;
                goto restart;
        }

        err = -EPIPE;
        if (other->sk_shutdown & RCV_SHUTDOWN)
                goto out_unlock;

        if (sk->sk_type != SOCK_SEQPACKET) {
                err = security_unix_may_send(sk->sk_socket, other->sk_socket);
                if (err)
                        goto out_unlock;
        }

        /* other == sk && unix_peer(other) != sk if
         * - unix_peer(sk) == NULL, destination address bound to sk
         * - unix_peer(sk) == sk by time of get but disconnected before lock
         */
        if (other != sk &&
            unlikely(unix_peer(other) != sk &&
            unix_recvq_full_lockless(other))) {
                if (timeo) {
                        /* NOTE(review): both exits below skip out_unlock, so
                         * unix_wait_for_peer() is expected to return with
                         * other's state lock released - confirm.
                         */
                        timeo = unix_wait_for_peer(other, timeo);

                        err = sock_intr_errno(timeo);
                        if (signal_pending(current))
                                goto out_free;

                        goto restart;
                }

                /* Not allowed to wait: take sk's state lock as well. */
                if (!sk_locked) {
                        unix_state_unlock(other);
                        unix_state_double_lock(sk, other);
                }

                if (unix_peer(sk) != other ||
                    unix_dgram_peer_wake_me(sk, other)) {
                        err = -EAGAIN;
                        sk_locked = 1;
                        goto out_unlock;
                }

                /* other's lock was dropped above, so redo the checks with
                 * both locks held.
                 */
                if (!sk_locked) {
                        sk_locked = 1;
                        goto restart_locked;
                }
        }

        if (unlikely(sk_locked))
                unix_state_unlock(sk);

        /* Queue the skb on the receiver and notify it. */
        if (sock_flag(other, SOCK_RCVTSTAMP))
                __net_timestamp(skb);
        maybe_add_creds(skb, sock, other);
        scm_stat_add(other, skb);
        skb_queue_tail(&other->sk_receive_queue, skb);
        unix_state_unlock(other);
        other->sk_data_ready(other);
        sock_put(other);
        scm_destroy(&scm);
        return len;

out_unlock:
        if (sk_locked)
                unix_state_unlock(sk);
        unix_state_unlock(other);
out_free:
        kfree_skb(skb);
out:
        if (other)
                sock_put(other);
        scm_destroy(&scm);
        return err;
}
2101
2102 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2103  * bytes, and a minimum of a full page.
2104  */
2105 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2106
2107 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2108 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2109                      struct scm_cookie *scm, bool fds_sent)
2110 {
2111         struct unix_sock *ousk = unix_sk(other);
2112         struct sk_buff *skb;
2113         int err = 0;
2114
2115         skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2116
2117         if (!skb)
2118                 return err;
2119
2120         err = unix_scm_to_skb(scm, skb, !fds_sent);
2121         if (err < 0) {
2122                 kfree_skb(skb);
2123                 return err;
2124         }
2125         skb_put(skb, 1);
2126         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2127
2128         if (err) {
2129                 kfree_skb(skb);
2130                 return err;
2131         }
2132
2133         unix_state_lock(other);
2134
2135         if (sock_flag(other, SOCK_DEAD) ||
2136             (other->sk_shutdown & RCV_SHUTDOWN)) {
2137                 unix_state_unlock(other);
2138                 kfree_skb(skb);
2139                 return -EPIPE;
2140         }
2141
2142         maybe_add_creds(skb, sock, other);
2143         skb_get(skb);
2144
2145         if (ousk->oob_skb)
2146                 consume_skb(ousk->oob_skb);
2147
2148         WRITE_ONCE(ousk->oob_skb, skb);
2149
2150         scm_stat_add(other, skb);
2151         skb_queue_tail(&other->sk_receive_queue, skb);
2152         sk_send_sigurg(other);
2153         unix_state_unlock(other);
2154         other->sk_data_ready(other);
2155
2156         return err;
2157 }
2158 #endif
2159
/*
 * sendmsg() handler for stream sockets.
 *
 * The payload is cut into skbs which are queued one by one on the
 * peer's receive queue.  With MSG_OOB (CONFIG_AF_UNIX_OOB enabled) the
 * last byte is sent separately through queue_oob().
 *
 * Returns the number of bytes sent if any were sent, otherwise a
 * negative errno.  SIGPIPE is raised on a broken pipe when nothing was
 * sent and MSG_NOSIGNAL is not set.
 */
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
                               size_t len)
{
        struct sock *sk = sock->sk;
        struct sock *other = NULL;
        int err, size;
        struct sk_buff *skb;
        int sent = 0;
        struct scm_cookie scm;
        bool fds_sent = false;
        int data_len;

        wait_for_unix_gc();
        err = scm_send(sock, msg, &scm, false);
        if (err < 0)
                return err;

        /* With OOB support the last byte is held back for queue_oob();
         * an empty MSG_OOB send, or MSG_OOB without OOB support, fails
         * with -EOPNOTSUPP.
         */
        err = -EOPNOTSUPP;
        if (msg->msg_flags & MSG_OOB) {
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
                if (len)
                        len--;
                else
#endif
                        goto out_err;
        }

        /* A destination address is never accepted on a stream socket. */
        if (msg->msg_namelen) {
                err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
                goto out_err;
        } else {
                err = -ENOTCONN;
                other = unix_peer(sk);
                if (!other)
                        goto out_err;
        }

        if (sk->sk_shutdown & SEND_SHUTDOWN)
                goto pipe_err;

        while (sent < len) {
                size = len - sent;

                if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
                        /* Empty skb; the data is spliced in further down. */
                        skb = sock_alloc_send_pskb(sk, 0, 0,
                                                   msg->msg_flags & MSG_DONTWAIT,
                                                   &err, 0);
                } else {
                        /* Keep two messages in the pipe so it schedules better */
                        size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

                        /* allow fallback to order-0 allocations */
                        size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

                        /* Bytes beyond the linear head go into paged data. */
                        data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

                        data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

                        skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
                                                   msg->msg_flags & MSG_DONTWAIT, &err,
                                                   get_order(UNIX_SKB_FRAGS_SZ));
                }
                if (!skb)
                        goto out_err;

                /* Only send the fds in the first buffer */
                err = unix_scm_to_skb(&scm, skb, !fds_sent);
                if (err < 0) {
                        kfree_skb(skb);
                        goto out_err;
                }
                fds_sent = true;

                if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
                        err = skb_splice_from_iter(skb, &msg->msg_iter, size,
                                                   sk->sk_allocation);
                        if (err < 0) {
                                kfree_skb(skb);
                                goto out_err;
                        }
                        /* The return value is used as the number of bytes
                         * spliced; charge it to the sender's sk_wmem_alloc.
                         */
                        size = err;
                        refcount_add(size, &sk->sk_wmem_alloc);
                } else {
                        skb_put(skb, size - data_len);
                        skb->data_len = data_len;
                        skb->len = size;
                        err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
                        if (err) {
                                kfree_skb(skb);
                                goto out_err;
                        }
                }

                unix_state_lock(other);

                if (sock_flag(other, SOCK_DEAD) ||
                    (other->sk_shutdown & RCV_SHUTDOWN))
                        goto pipe_err_free;

                maybe_add_creds(skb, sock, other);
                scm_stat_add(other, skb);
                skb_queue_tail(&other->sk_receive_queue, skb);
                unix_state_unlock(other);
                other->sk_data_ready(other);
                sent += size;
        }

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
        /* Send the byte held back above as out-of-band data. */
        if (msg->msg_flags & MSG_OOB) {
                err = queue_oob(sock, msg, other, &scm, fds_sent);
                if (err)
                        goto out_err;
                sent++;
        }
#endif

        scm_destroy(&scm);

        return sent;

pipe_err_free:
        unix_state_unlock(other);
        kfree_skb(skb);
pipe_err:
        if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
                send_sig(SIGPIPE, current, 0);
        err = -EPIPE;
out_err:
        scm_destroy(&scm);
        /* A partial send reports the byte count rather than the error. */
        return sent ? : err;
}
2291
2292 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2293                                   size_t len)
2294 {
2295         int err;
2296         struct sock *sk = sock->sk;
2297
2298         err = sock_error(sk);
2299         if (err)
2300                 return err;
2301
2302         if (sk->sk_state != TCP_ESTABLISHED)
2303                 return -ENOTCONN;
2304
2305         if (msg->msg_namelen)
2306                 msg->msg_namelen = 0;
2307
2308         return unix_dgram_sendmsg(sock, msg, len);
2309 }
2310
2311 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2312                                   size_t size, int flags)
2313 {
2314         struct sock *sk = sock->sk;
2315
2316         if (sk->sk_state != TCP_ESTABLISHED)
2317                 return -ENOTCONN;
2318
2319         return unix_dgram_recvmsg(sock, msg, size, flags);
2320 }
2321
2322 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2323 {
2324         struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2325
2326         if (addr) {
2327                 msg->msg_namelen = addr->len;
2328                 memcpy(msg->msg_name, addr->name, addr->len);
2329         }
2330 }
2331
/*
 * Receive one datagram from @sk into @msg.
 *
 * Copies at most @size bytes; MSG_TRUNC is set in msg->msg_flags when
 * the datagram is longer than that.  The sender's address, a receive
 * timestamp and the control data (credentials, security data, passed
 * files) are delivered as well where applicable.
 *
 * Returns the number of bytes copied (or, when MSG_TRUNC is passed in
 * @flags, the datagram length past the peek offset), 0 for EOF on a
 * shut-down SOCK_SEQPACKET socket, or a negative errno.
 */
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
                         int flags)
{
        struct scm_cookie scm;
        struct socket *sock = sk->sk_socket;
        struct unix_sock *u = unix_sk(sk);
        struct sk_buff *skb, *last;
        long timeo;
        int skip;
        int err;

        err = -EOPNOTSUPP;
        if (flags&MSG_OOB)
                goto out;

        timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

        /* Try to take an skb with iolock held; when one is found the
         * loop is left with iolock still locked.
         */
        do {
                mutex_lock(&u->iolock);

                skip = sk_peek_offset(sk, flags);
                skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
                                              &skip, &err, &last);
                if (skb) {
                        if (!(flags & MSG_PEEK))
                                scm_stat_del(sk, skb);
                        break;
                }

                mutex_unlock(&u->iolock);

                if (err != -EAGAIN)
                        break;
        } while (timeo &&
                 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
                                              &err, &timeo, last));

        if (!skb) { /* implies iolock unlocked */
                unix_state_lock(sk);
                /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
                if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
                    (sk->sk_shutdown & RCV_SHUTDOWN))
                        err = 0;
                unix_state_unlock(sk);
                goto out;
        }

        /* Wake anyone sleeping on our peer_wait queue for writability. */
        if (wq_has_sleeper(&u->peer_wait))
                wake_up_interruptible_sync_poll(&u->peer_wait,
                                                EPOLLOUT | EPOLLWRNORM |
                                                EPOLLWRBAND);

        if (msg->msg_name)
                unix_copy_addr(msg, skb->sk);

        /* Clamp the copy to what is left of the datagram after the peek
         * offset; flag truncation if the buffer is too small.
         */
        if (size > skb->len - skip)
                size = skb->len - skip;
        else if (size < skb->len - skip)
                msg->msg_flags |= MSG_TRUNC;

        err = skb_copy_datagram_msg(skb, skip, msg, size);
        if (err)
                goto out_free;

        if (sock_flag(sk, SOCK_RCVTSTAMP))
                __sock_recv_timestamp(msg, sk, skb);

        memset(&scm, 0, sizeof(scm));

        scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
        unix_set_secdata(&scm, skb);

        if (!(flags & MSG_PEEK)) {
                /* Real read: the files move from the skb to the receiver. */
                if (UNIXCB(skb).fp)
                        unix_detach_fds(&scm, skb);

                sk_peek_offset_bwd(sk, skb->len);
        } else {
                /* It is questionable: on PEEK we could:
                   - do not return fds - good, but too simple 8)
                   - return fds, and do not return them on read (old strategy,
                     apparently wrong)
                   - clone fds (I chose it for now, it is the most universal
                     solution)

                   POSIX 1003.1g does not actually define this clearly
                   at all. POSIX 1003.1g doesn't define a lot of things
                   clearly however!

                */

                sk_peek_offset_fwd(sk, size);

                if (UNIXCB(skb).fp)
                        unix_peek_fds(&scm, skb);
        }
        err = (flags & MSG_TRUNC) ? skb->len - skip : size;

        scm_recv_unix(sock, msg, &scm, flags);

out_free:
        skb_free_datagram(sk, skb);
        mutex_unlock(&u->iolock);
out:
        return err;
}
2438
2439 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2440                               int flags)
2441 {
2442         struct sock *sk = sock->sk;
2443
2444 #ifdef CONFIG_BPF_SYSCALL
2445         const struct proto *prot = READ_ONCE(sk->sk_prot);
2446
2447         if (prot != &unix_dgram_proto)
2448                 return prot->recvmsg(sk, msg, size, flags, NULL);
2449 #endif
2450         return __unix_dgram_recvmsg(sk, msg, size, flags);
2451 }
2452
2453 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2454 {
2455         struct unix_sock *u = unix_sk(sk);
2456         struct sk_buff *skb;
2457         int err;
2458
2459         mutex_lock(&u->iolock);
2460         skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2461         mutex_unlock(&u->iolock);
2462         if (!skb)
2463                 return err;
2464
2465         return recv_actor(sk, skb);
2466 }
2467
/*
 *      Sleep until more data has arrived. But check for races..
 *
 *      @last and @last_len describe the receive queue tail (skb and its
 *      length) as the caller last saw it.  The wait ends when the tail
 *      or its length differs from that, the socket has an error or is
 *      shut down for receiving, a signal is pending, the timeout has
 *      run out, or the socket is found dead after a sleep.
 *
 *      When @freezable is true the sleep state also includes
 *      TASK_FREEZABLE.
 *
 *      Returns the remaining timeout.
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
                                  struct sk_buff *last, unsigned int last_len,
                                  bool freezable)
{
        unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
        struct sk_buff *tail;
        DEFINE_WAIT(wait);

        unix_state_lock(sk);

        for (;;) {
                /* Get on the wait queue before testing the conditions. */
                prepare_to_wait(sk_sleep(sk), &wait, state);

                tail = skb_peek_tail(&sk->sk_receive_queue);
                if (tail != last ||
                    (tail && tail->len != last_len) ||
                    sk->sk_err ||
                    (sk->sk_shutdown & RCV_SHUTDOWN) ||
                    signal_pending(current) ||
                    !timeo)
                        break;

                sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
                /* Sleep without the state lock, retake it afterwards. */
                unix_state_unlock(sk);
                timeo = schedule_timeout(timeo);
                unix_state_lock(sk);

                /* Note: leaves SOCKWQ_ASYNC_WAITDATA set on this exit. */
                if (sock_flag(sk, SOCK_DEAD))
                        break;

                sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
        }

        finish_wait(sk_sleep(sk), &wait);
        unix_state_unlock(sk);
        return timeo;
}
2508
/*
 * Length of @skb minus the amount recorded as consumed in its unix
 * control block.
 */
static unsigned int unix_skb_len(const struct sk_buff *skb)
{
        return skb->len - UNIXCB(skb).consumed;
}
2513
/* Arguments of one stream read, handed to the read helpers and actors. */
struct unix_stream_read_state {
        /* Callback that delivers data from one skb.  unix_stream_recv_urg()
         * calls it as recv_actor(skb, 0, 1, state) and treats a negative
         * return as failure; the two ints are presumably a skip offset and
         * a byte count - TODO confirm against the actors.
         */
        int (*recv_actor)(struct sk_buff *, int, int,
                          struct unix_stream_read_state *);
        /* Socket being read from. */
        struct socket *socket;
        /* Message header of the read; MSG_OOB is set in its msg_flags
         * by unix_stream_recv_urg().
         */
        struct msghdr *msg;
        /* Presumably the destination pipe for splice reads - not used
         * in the code visible here, TODO confirm.
         */
        struct pipe_inode_info *pipe;
        /* Size of the read request. */
        size_t size;
        /* Receive flags; tested against MSG_PEEK, MSG_OOB, MSG_DONTWAIT
         * and MSG_WAITALL by the readers.
         */
        int flags;
        /* Presumably splice flags for the pipe case - TODO confirm. */
        unsigned int splice_flags;
};
2524
2525 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2526 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2527 {
2528         struct socket *sock = state->socket;
2529         struct sock *sk = sock->sk;
2530         struct unix_sock *u = unix_sk(sk);
2531         int chunk = 1;
2532         struct sk_buff *oob_skb;
2533
2534         mutex_lock(&u->iolock);
2535         unix_state_lock(sk);
2536
2537         if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2538                 unix_state_unlock(sk);
2539                 mutex_unlock(&u->iolock);
2540                 return -EINVAL;
2541         }
2542
2543         oob_skb = u->oob_skb;
2544
2545         if (!(state->flags & MSG_PEEK))
2546                 WRITE_ONCE(u->oob_skb, NULL);
2547
2548         unix_state_unlock(sk);
2549
2550         chunk = state->recv_actor(oob_skb, 0, chunk, state);
2551
2552         if (!(state->flags & MSG_PEEK)) {
2553                 UNIXCB(oob_skb).consumed += 1;
2554                 kfree_skb(oob_skb);
2555         }
2556
2557         mutex_unlock(&u->iolock);
2558
2559         if (chunk < 0)
2560                 return -EFAULT;
2561
2562         state->msg->msg_flags |= MSG_OOB;
2563         return 1;
2564 }
2565
2566 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2567                                   int flags, int copied)
2568 {
2569         struct unix_sock *u = unix_sk(sk);
2570
2571         if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2572                 skb_unlink(skb, &sk->sk_receive_queue);
2573                 consume_skb(skb);
2574                 skb = NULL;
2575         } else {
2576                 if (skb == u->oob_skb) {
2577                         if (copied) {
2578                                 skb = NULL;
2579                         } else if (sock_flag(sk, SOCK_URGINLINE)) {
2580                                 if (!(flags & MSG_PEEK)) {
2581                                         WRITE_ONCE(u->oob_skb, NULL);
2582                                         consume_skb(skb);
2583                                 }
2584                         } else if (!(flags & MSG_PEEK)) {
2585                                 skb_unlink(skb, &sk->sk_receive_queue);
2586                                 consume_skb(skb);
2587                                 skb = skb_peek(&sk->sk_receive_queue);
2588                         }
2589                 }
2590         }
2591         return skb;
2592 }
2593 #endif
2594
2595 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2596 {
2597         if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2598                 return -ENOTCONN;
2599
2600         return unix_read_skb(sk, recv_actor);
2601 }
2602
/*
 * unix_stream_read_generic - receive loop shared by __unix_stream_recvmsg(),
 * unix_stream_recvmsg() and unix_stream_splice_read().
 * @state:     the read request: socket, msg (may be NULL), size, flags and
 *             the recv_actor callback that moves bytes out of one skb
 * @freezable: passed through unchanged to unix_stream_data_wait()
 *
 * Walks sk->sk_receive_queue while holding u->iolock and hands each skb to
 * state->recv_actor until state->size bytes were moved or a stop condition
 * (error, shutdown, fd-carrying skb, credential mismatch, ...) is met.
 *
 * Return: 'copied' when it is non-zero (it is set to -EFAULT if the actor
 * fails before anything was copied), otherwise 'err'.
 */
2603 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2604                                     bool freezable)
2605 {
2606         struct scm_cookie scm;
2607         struct socket *sock = state->socket;
2608         struct sock *sk = sock->sk;
2609         struct unix_sock *u = unix_sk(sk);
2610         int copied = 0;
2611         int flags = state->flags;
2612         int noblock = flags & MSG_DONTWAIT;
2613         bool check_creds = false;
2614         int target;
2615         int err = 0;
2616         long timeo;
2617         int skip;
2618         size_t size = state->size;
2619         unsigned int last_len;
2620
             /* Only a socket in TCP_ESTABLISHED state can be read from here. */
2621         if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2622                 err = -EINVAL;
2623                 goto out;
2624         }
2625
             /*
              * MSG_OOB never enters the main loop: the result is -EOPNOTSUPP
              * unless CONFIG_AF_UNIX_OOB is enabled, in which case it is
              * whatever unix_stream_recv_urg() returns.
              */
2626         if (unlikely(flags & MSG_OOB)) {
2627                 err = -EOPNOTSUPP;
2628 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2629                 err = unix_stream_recv_urg(state);
2630 #endif
2631                 goto out;
2632         }
2633
             /*
              * target: byte count compared against 'copied' when the queue
              * runs dry; timeo: remaining wait time, 0 means do not sleep.
              */
2634         target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2635         timeo = sock_rcvtimeo(sk, noblock);
2636
2637         memset(&scm, 0, sizeof(scm));
2638
2639         /* Lock the socket to prevent queue disordering
2640          * while sleeps in memcpy_tomsg
2641          */
2642         mutex_lock(&u->iolock);
2643
             /* Bytes to step over first: the socket peek offset, clamped to >= 0. */
2644         skip = max(sk_peek_offset(sk, flags), 0);
2645
2646         do {
2647                 int chunk;
2648                 bool drop_skb;
2649                 struct sk_buff *skb, *last;
2650
                     /* (Re)start from the head of the queue under the state lock. */
2651 redo:
2652                 unix_state_lock(sk);
2653                 if (sock_flag(sk, SOCK_DEAD)) {
2654                         err = -ECONNRESET;
2655                         goto unlock;
2656                 }
2657                 last = skb = skb_peek(&sk->sk_receive_queue);
2658                 last_len = last ? last->len : 0;
2659
2660 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
                     /*
                      * manage_oob() may drop or replace the head skb. A NULL
                      * result ends the read if data was already copied,
                      * otherwise the queue is examined again.
                      */
2661                 if (skb) {
2662                         skb = manage_oob(skb, sk, flags, copied);
2663                         if (!skb) {
2664                                 unix_state_unlock(sk);
2665                                 if (copied)
2666                                         break;
2667                                 goto redo;
2668                         }
2669                 }
2670 #endif
2671 again:
                     /* No skb to read from: finish, fail, or wait for more data. */
2672                 if (skb == NULL) {
2673                         if (copied >= target)
2674                                 goto unlock;
2675
2676                         /*
2677                          *      POSIX 1003.1g mandates this order.
2678                          */
2679
2680                         err = sock_error(sk);
2681                         if (err)
2682                                 goto unlock;
2683                         if (sk->sk_shutdown & RCV_SHUTDOWN)
2684                                 goto unlock;
2685
2686                         unix_state_unlock(sk);
2687                         if (!timeo) {
2688                                 err = -EAGAIN;
2689                                 break;
2690                         }
2691
                             /* iolock is released for the duration of the wait. */
2692                         mutex_unlock(&u->iolock);
2693
2694                         timeo = unix_stream_data_wait(sk, timeo, last,
2695                                                       last_len, freezable);
2696
                             /* iolock is not held here, hence the jump past its unlock. */
2697                         if (signal_pending(current)) {
2698                                 err = sock_intr_errno(timeo);
2699                                 scm_destroy(&scm);
2700                                 goto out;
2701                         }
2702
2703                         mutex_lock(&u->iolock);
2704                         goto redo;
                             /* Reached only via goto, with the state lock held. */
2705 unlock:
2706                         unix_state_unlock(sk);
2707                         break;
2708                 }
2709
                     /*
                      * Step over whole skbs covered by 'skip'; if the queue ends
                      * first, handle it like an empty queue.
                      */
2710                 while (skip >= unix_skb_len(skb)) {
2711                         skip -= unix_skb_len(skb);
2712                         last = skb;
2713                         last_len = skb->len;
2714                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2715                         if (!skb)
2716                                 goto again;
2717                 }
2718
2719                 unix_state_unlock(sk);
2720
2721                 if (check_creds) {
2722                         /* Never glue messages from different writers */
2723                         if (!unix_skb_scm_eq(skb, &scm))
2724                                 break;
2725                 } else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2726                            test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2727                         /* Copy credentials */
2728                         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2729                         unix_set_secdata(&scm, skb);
2730                         check_creds = true;
2731                 }
2732
2733                 /* Copy address just once */
                     /*
                      * NOTE(review): 'sunaddr' is local to this block, so the
                      * assignment below does not stop the copy from being
                      * repeated on a later iteration.
                      */
2734                 if (state->msg && state->msg->msg_name) {
2735                         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2736                                          state->msg->msg_name);
2737                         unix_copy_addr(state->msg, skb->sk);
2738                         sunaddr = NULL;
2739                 }
2740
                     /*
                      * The state lock is not held across the actor, so take an
                      * extra skb reference for the call and drop it afterwards.
                      */
2741                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2742                 skb_get(skb);
2743                 chunk = state->recv_actor(skb, skip, chunk, state);
2744                 drop_skb = !unix_skb_len(skb);
2745                 /* skb is only safe to use if !drop_skb */
2746                 consume_skb(skb);
2747                 if (chunk < 0) {
2748                         if (copied == 0)
2749                                 copied = -EFAULT;
2750                         break;
2751                 }
2752                 copied += chunk;
2753                 size -= chunk;
2754
2755                 if (drop_skb) {
2756                         /* the skb was touched by a concurrent reader;
2757                          * we should not expect anything from this skb
2758                          * anymore and assume it invalid - we can be
2759                          * sure it was dropped from the socket queue
2760                          *
2761                          * let's report a short read
2762                          */
2763                         err = 0;
2764                         break;
2765                 }
2766
2767                 /* Mark read part of skb as used */
2768                 if (!(flags & MSG_PEEK)) {
2769                         UNIXCB(skb).consumed += chunk;
2770
2771                         sk_peek_offset_bwd(sk, chunk);
2772
2773                         if (UNIXCB(skb).fp) {
2774                                 scm_stat_del(sk, skb);
2775                                 unix_detach_fds(&scm, skb);
2776                         }
2777
                             /* Bytes remain in this skb: stop here. */
2778                         if (unix_skb_len(skb))
2779                                 break;
2780
                             /* Fully read: remove it from the queue and release it. */
2781                         skb_unlink(skb, &sk->sk_receive_queue);
2782                         consume_skb(skb);
2783
                             /* Stop once file descriptors were picked up. */
2784                         if (scm.fp)
2785                                 break;
2786                 } else {
2787                         /* It is questionable, see note in unix_dgram_recvmsg.
2788                          */
2789                         if (UNIXCB(skb).fp)
2790                                 unix_peek_fds(&scm, skb);
2791
2792                         sk_peek_offset_fwd(sk, chunk);
2793
2794                         if (UNIXCB(skb).fp)
2795                                 break;
2796
                             /* Peeking: move on to the next skb without consuming this one. */
2797                         skip = 0;
2798                         last = skb;
2799                         last_len = skb->len;
2800                         unix_state_lock(sk);
2801                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2802                         if (skb)
2803                                 goto again;
2804                         unix_state_unlock(sk);
2805                         break;
2806                 }
2807         } while (size);
2808
2809         mutex_unlock(&u->iolock);
             /* Hand the collected ancillary data to the msghdr, or release it. */
2810         if (state->msg)
2811                 scm_recv_unix(sock, state->msg, &scm, flags);
2812         else
2813                 scm_destroy(&scm);
2814 out:
2815         return copied ? : err;
2816 }
2817
2818 static int unix_stream_read_actor(struct sk_buff *skb,
2819                                   int skip, int chunk,
2820                                   struct unix_stream_read_state *state)
2821 {
2822         int ret;
2823
2824         ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2825                                     state->msg, chunk);
2826         return ret ?: chunk;
2827 }
2828
2829 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2830                           size_t size, int flags)
2831 {
2832         struct unix_stream_read_state state = {
2833                 .recv_actor = unix_stream_read_actor,
2834                 .socket = sk->sk_socket,
2835                 .msg = msg,
2836                 .size = size,
2837                 .flags = flags
2838         };
2839
2840         return unix_stream_read_generic(&state, true);
2841 }
2842
2843 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2844                                size_t size, int flags)
2845 {
2846         struct unix_stream_read_state state = {
2847                 .recv_actor = unix_stream_read_actor,
2848                 .socket = sock,
2849                 .msg = msg,
2850                 .size = size,
2851                 .flags = flags
2852         };
2853
2854 #ifdef CONFIG_BPF_SYSCALL
2855         struct sock *sk = sock->sk;
2856         const struct proto *prot = READ_ONCE(sk->sk_prot);
2857
2858         if (prot != &unix_stream_proto)
2859                 return prot->recvmsg(sk, msg, size, flags, NULL);
2860 #endif
2861         return unix_stream_read_generic(&state, true);
2862 }
2863
2864 static int unix_stream_splice_actor(struct sk_buff *skb,
2865                                     int skip, int chunk,
2866                                     struct unix_stream_read_state *state)
2867 {
2868         return skb_splice_bits(skb, state->socket->sk,
2869                                UNIXCB(skb).consumed + skip,
2870                                state->pipe, chunk, state->splice_flags);
2871 }
2872
2873 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2874                                        struct pipe_inode_info *pipe,
2875                                        size_t size, unsigned int flags)
2876 {
2877         struct unix_stream_read_state state = {
2878                 .recv_actor = unix_stream_splice_actor,
2879                 .socket = sock,
2880                 .pipe = pipe,
2881                 .size = size,
2882                 .splice_flags = flags,
2883         };
2884
2885         if (unlikely(*ppos))
2886                 return -ESPIPE;
2887
2888         if (sock->file->f_flags & O_NONBLOCK ||
2889             flags & SPLICE_F_NONBLOCK)
2890                 state.flags = MSG_DONTWAIT;
2891
2892         return unix_stream_read_generic(&state, false);
2893 }
2894
2895 static int unix_shutdown(struct socket *sock, int mode)
2896 {
2897         struct sock *sk = sock->sk;
2898         struct sock *other;
2899
2900         if (mode < SHUT_RD || mode > SHUT_RDWR)
2901                 return -EINVAL;
2902         /* This maps:
2903          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2904          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2905          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2906          */
2907         ++mode;
2908
2909         unix_state_lock(sk);
2910         WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2911         other = unix_peer(sk);
2912         if (other)
2913                 sock_hold(other);
2914         unix_state_unlock(sk);
2915         sk->sk_state_change(sk);
2916
2917         if (other &&
2918                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2919
2920                 int peer_mode = 0;
2921                 const struct proto *prot = READ_ONCE(other->sk_prot);
2922
2923                 if (prot->unhash)
2924                         prot->unhash(other);
2925                 if (mode&RCV_SHUTDOWN)
2926                         peer_mode |= SEND_SHUTDOWN;
2927                 if (mode&SEND_SHUTDOWN)
2928                         peer_mode |= RCV_SHUTDOWN;
2929                 unix_state_lock(other);
2930                 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2931                 unix_state_unlock(other);
2932                 other->sk_state_change(other);
2933                 if (peer_mode == SHUTDOWN_MASK)
2934                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2935                 else if (peer_mode & RCV_SHUTDOWN)
2936                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2937         }
2938         if (other)
2939                 sock_put(other);
2940
2941         return 0;
2942 }
2943
2944 long unix_inq_len(struct sock *sk)
2945 {
2946         struct sk_buff *skb;
2947         long amount = 0;
2948
2949         if (sk->sk_state == TCP_LISTEN)
2950                 return -EINVAL;
2951
2952         spin_lock(&sk->sk_receive_queue.lock);
2953         if (sk->sk_type == SOCK_STREAM ||
2954             sk->sk_type == SOCK_SEQPACKET) {
2955                 skb_queue_walk(&sk->sk_receive_queue, skb)
2956                         amount += unix_skb_len(skb);
2957         } else {
2958                 skb = skb_peek(&sk->sk_receive_queue);
2959                 if (skb)
2960                         amount = skb->len;
2961         }
2962         spin_unlock(&sk->sk_receive_queue.lock);
2963
2964         return amount;
2965 }
2966 EXPORT_SYMBOL_GPL(unix_inq_len);
2967
/* Outgoing queue size of @sk, as reported by sk_wmem_alloc_get(). */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
2973
2974 static int unix_open_file(struct sock *sk)
2975 {
2976         struct path path;
2977         struct file *f;
2978         int fd;
2979
2980         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2981                 return -EPERM;
2982
2983         if (!smp_load_acquire(&unix_sk(sk)->addr))
2984                 return -ENOENT;
2985
2986         path = unix_sk(sk)->path;
2987         if (!path.dentry)
2988                 return -ENOENT;
2989
2990         path_get(&path);
2991
2992         fd = get_unused_fd_flags(O_CLOEXEC);
2993         if (fd < 0)
2994                 goto out;
2995
2996         f = dentry_open(&path, O_PATH, current_cred());
2997         if (IS_ERR(f)) {
2998                 put_unused_fd(fd);
2999                 fd = PTR_ERR(f);
3000                 goto out;
3001         }
3002
3003         fd_install(fd, f);
3004 out:
3005         path_put(&path);
3006
3007         return fd;
3008 }
3009
3010 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3011 {
3012         struct sock *sk = sock->sk;
3013         long amount = 0;
3014         int err;
3015
3016         switch (cmd) {
3017         case SIOCOUTQ:
3018                 amount = unix_outq_len(sk);
3019                 err = put_user(amount, (int __user *)arg);
3020                 break;
3021         case SIOCINQ:
3022                 amount = unix_inq_len(sk);
3023                 if (amount < 0)
3024                         err = amount;
3025                 else
3026                         err = put_user(amount, (int __user *)arg);
3027                 break;
3028         case SIOCUNIXFILE:
3029                 err = unix_open_file(sk);
3030                 break;
3031 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3032         case SIOCATMARK:
3033                 {
3034                         struct sk_buff *skb;
3035                         int answ = 0;
3036
3037                         skb = skb_peek(&sk->sk_receive_queue);
3038                         if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3039                                 answ = 1;
3040                         err = put_user(answ, (int __user *)arg);
3041                 }
3042                 break;
3043 #endif
3044         default:
3045                 err = -ENOIOCTLCMD;
3046                 break;
3047         }
3048         return err;
3049 }
3050
3051 #ifdef CONFIG_COMPAT
/* Compat ioctl entry: converts @arg with compat_ptr() and calls unix_ioctl(). */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return unix_ioctl(sock, cmd, native_arg);
}
3056 #endif
3057
3058 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3059 {
3060         struct sock *sk = sock->sk;
3061         __poll_t mask;
3062         u8 shutdown;
3063
3064         sock_poll_wait(file, sock, wait);
3065         mask = 0;
3066         shutdown = READ_ONCE(sk->sk_shutdown);
3067
3068         /* exceptional events? */
3069         if (READ_ONCE(sk->sk_err))
3070                 mask |= EPOLLERR;
3071         if (shutdown == SHUTDOWN_MASK)
3072                 mask |= EPOLLHUP;
3073         if (shutdown & RCV_SHUTDOWN)
3074                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3075
3076         /* readable? */
3077         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3078                 mask |= EPOLLIN | EPOLLRDNORM;
3079         if (sk_is_readable(sk))
3080                 mask |= EPOLLIN | EPOLLRDNORM;
3081 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3082         if (READ_ONCE(unix_sk(sk)->oob_skb))
3083                 mask |= EPOLLPRI;
3084 #endif
3085
3086         /* Connection-based need to check for termination and startup */
3087         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3088             sk->sk_state == TCP_CLOSE)
3089                 mask |= EPOLLHUP;
3090
3091         /*
3092          * we set writable also when the other side has shut down the
3093          * connection. This prevents stuck sockets.
3094          */
3095         if (unix_writable(sk))
3096                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3097
3098         return mask;
3099 }
3100
3101 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3102                                     poll_table *wait)
3103 {
3104         struct sock *sk = sock->sk, *other;
3105         unsigned int writable;
3106         __poll_t mask;
3107         u8 shutdown;
3108
3109         sock_poll_wait(file, sock, wait);
3110         mask = 0;
3111         shutdown = READ_ONCE(sk->sk_shutdown);
3112
3113         /* exceptional events? */
3114         if (READ_ONCE(sk->sk_err) ||
3115             !skb_queue_empty_lockless(&sk->sk_error_queue))
3116                 mask |= EPOLLERR |
3117                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3118
3119         if (shutdown & RCV_SHUTDOWN)
3120                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3121         if (shutdown == SHUTDOWN_MASK)
3122                 mask |= EPOLLHUP;
3123
3124         /* readable? */
3125         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3126                 mask |= EPOLLIN | EPOLLRDNORM;
3127         if (sk_is_readable(sk))
3128                 mask |= EPOLLIN | EPOLLRDNORM;
3129
3130         /* Connection-based need to check for termination and startup */
3131         if (sk->sk_type == SOCK_SEQPACKET) {
3132                 if (sk->sk_state == TCP_CLOSE)
3133                         mask |= EPOLLHUP;
3134                 /* connection hasn't started yet? */
3135                 if (sk->sk_state == TCP_SYN_SENT)
3136                         return mask;
3137         }
3138
3139         /* No write status requested, avoid expensive OUT tests. */
3140         if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3141                 return mask;
3142
3143         writable = unix_writable(sk);
3144         if (writable) {
3145                 unix_state_lock(sk);
3146
3147                 other = unix_peer(sk);
3148                 if (other && unix_peer(other) != sk &&
3149                     unix_recvq_full_lockless(other) &&
3150                     unix_dgram_peer_wake_me(sk, other))
3151                         writable = 0;
3152
3153                 unix_state_unlock(sk);
3154         }
3155
3156         if (writable)
3157                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3158         else
3159                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3160
3161         return mask;
3162 }
3163
3164 #ifdef CONFIG_PROC_FS
3165
/*
 * A seq_file position packs two values into one number: the hash bucket
 * index in the bits above BUCKET_SPACE and the offset within that bucket in
 * the low BUCKET_SPACE bits. The iteration helpers below start offsets at 1.
 */
3166 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3167
/* Extract the bucket index / in-bucket offset from a position, or build one. */
3168 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3169 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3170 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3171
3172 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3173 {
3174         unsigned long offset = get_offset(*pos);
3175         unsigned long bucket = get_bucket(*pos);
3176         unsigned long count = 0;
3177         struct sock *sk;
3178
3179         for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3180              sk; sk = sk_next(sk)) {
3181                 if (++count == offset)
3182                         break;
3183         }
3184
3185         return sk;
3186 }
3187
3188 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3189 {
3190         unsigned long bucket = get_bucket(*pos);
3191         struct net *net = seq_file_net(seq);
3192         struct sock *sk;
3193
3194         while (bucket < UNIX_HASH_SIZE) {
3195                 spin_lock(&net->unx.table.locks[bucket]);
3196
3197                 sk = unix_from_bucket(seq, pos);
3198                 if (sk)
3199                         return sk;
3200
3201                 spin_unlock(&net->unx.table.locks[bucket]);
3202
3203                 *pos = set_bucket_offset(++bucket, 1);
3204         }
3205
3206         return NULL;
3207 }
3208
3209 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3210                                   loff_t *pos)
3211 {
3212         unsigned long bucket = get_bucket(*pos);
3213
3214         sk = sk_next(sk);
3215         if (sk)
3216                 return sk;
3217
3218
3219         spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3220
3221         *pos = set_bucket_offset(++bucket, 1);
3222
3223         return unix_get_first(seq, pos);
3224 }
3225
3226 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3227 {
3228         if (!*pos)
3229                 return SEQ_START_TOKEN;
3230
3231         return unix_get_first(seq, pos);
3232 }
3233
3234 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3235 {
3236         ++*pos;
3237
3238         if (v == SEQ_START_TOKEN)
3239                 return unix_get_first(seq, pos);
3240
3241         return unix_get_next(seq, v, pos);
3242 }
3243
3244 static void unix_seq_stop(struct seq_file *seq, void *v)
3245 {
3246         struct sock *sk = v;
3247
3248         if (sk)
3249                 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3250 }
3251
3252 static int unix_seq_show(struct seq_file *seq, void *v)
3253 {
3254
3255         if (v == SEQ_START_TOKEN)
3256                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3257                          "Inode Path\n");
3258         else {
3259                 struct sock *s = v;
3260                 struct unix_sock *u = unix_sk(s);
3261                 unix_state_lock(s);
3262
3263                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3264                         s,
3265                         refcount_read(&s->sk_refcnt),
3266                         0,
3267                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3268                         s->sk_type,
3269                         s->sk_socket ?
3270                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3271                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3272                         sock_i_ino(s));
3273
3274                 if (u->addr) {  // under a hash table lock here
3275                         int i, len;
3276                         seq_putc(seq, ' ');
3277
3278                         i = 0;
3279                         len = u->addr->len -
3280                                 offsetof(struct sockaddr_un, sun_path);
3281                         if (u->addr->name->sun_path[0]) {
3282                                 len--;
3283                         } else {
3284                                 seq_putc(seq, '@');
3285                                 i++;
3286                         }
3287                         for ( ; i < len; i++)
3288                                 seq_putc(seq, u->addr->name->sun_path[i] ?:
3289                                          '@');
3290                 }
3291                 unix_state_unlock(s);
3292                 seq_putc(seq, '\n');
3293         }
3294
3295         return 0;
3296 }
3297
/* seq_file callbacks behind the "unix" proc entry created in unix_net_init(). */
3298 static const struct seq_operations unix_seq_ops = {
3299         .start  = unix_seq_start,
3300         .next   = unix_seq_next,
3301         .stop   = unix_seq_stop,
3302         .show   = unix_seq_show,
3303 };
3304
3305 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
/*
 * Per-seq_file state of the batched BPF iterator (seq->private in the
 * bpf_iter_unix_* functions below).
 */
3306 struct bpf_unix_iter_state {
3307         struct seq_net_private p;
3308         unsigned int cur_sk;    /* index in batch[] of the entry being walked */
3309         unsigned int end_sk;    /* number of entries stored (and held) in batch[] */
3310         unsigned int max_sk;    /* capacity of batch[] */
3311         struct sock **batch;    /* sockets pinned with sock_hold() */
3312         bool st_bucket_done;    /* set when a whole bucket fit into batch[] */
3313 };
3314
/* Context handed to the BPF program by unix_prog_seq_show(). */
3315 struct bpf_iter__unix {
3316         __bpf_md_ptr(struct bpf_iter_meta *, meta);
3317         __bpf_md_ptr(struct unix_sock *, unix_sk);
3318         uid_t uid __aligned(8);
3319 };
3320
3321 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3322                               struct unix_sock *unix_sk, uid_t uid)
3323 {
3324         struct bpf_iter__unix ctx;
3325
3326         meta->seq_num--;  /* skip SEQ_START_TOKEN */
3327         ctx.meta = meta;
3328         ctx.unix_sk = unix_sk;
3329         ctx.uid = uid;
3330         return bpf_iter_run_prog(prog, &ctx);
3331 }
3332
3333 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3334
3335 {
3336         struct bpf_unix_iter_state *iter = seq->private;
3337         unsigned int expected = 1;
3338         struct sock *sk;
3339
3340         sock_hold(start_sk);
3341         iter->batch[iter->end_sk++] = start_sk;
3342
3343         for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3344                 if (iter->end_sk < iter->max_sk) {
3345                         sock_hold(sk);
3346                         iter->batch[iter->end_sk++] = sk;
3347                 }
3348
3349                 expected++;
3350         }
3351
3352         spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3353
3354         return expected;
3355 }
3356
3357 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3358 {
3359         while (iter->cur_sk < iter->end_sk)
3360                 sock_put(iter->batch[iter->cur_sk++]);
3361 }
3362
3363 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3364                                        unsigned int new_batch_sz)
3365 {
3366         struct sock **new_batch;
3367
3368         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3369                              GFP_USER | __GFP_NOWARN);
3370         if (!new_batch)
3371                 return -ENOMEM;
3372
3373         bpf_iter_unix_put_batch(iter);
3374         kvfree(iter->batch);
3375         iter->batch = new_batch;
3376         iter->max_sk = new_batch_sz;
3377
3378         return 0;
3379 }
3380
3381 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3382                                         loff_t *pos)
3383 {
3384         struct bpf_unix_iter_state *iter = seq->private;
3385         unsigned int expected;
3386         bool resized = false;
3387         struct sock *sk;
3388
3389         if (iter->st_bucket_done)
3390                 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3391
3392 again:
3393         /* Get a new batch */
3394         iter->cur_sk = 0;
3395         iter->end_sk = 0;
3396
3397         sk = unix_get_first(seq, pos);
3398         if (!sk)
3399                 return NULL; /* Done */
3400
3401         expected = bpf_iter_unix_hold_batch(seq, sk);
3402
3403         if (iter->end_sk == expected) {
3404                 iter->st_bucket_done = true;
3405                 return sk;
3406         }
3407
3408         if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3409                 resized = true;
3410                 goto again;
3411         }
3412
3413         return sk;
3414 }
3415
3416 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3417 {
3418         if (!*pos)
3419                 return SEQ_START_TOKEN;
3420
3421         /* bpf iter does not support lseek, so it always
3422          * continue from where it was stop()-ped.
3423          */
3424         return bpf_iter_unix_batch(seq, pos);
3425 }
3426
3427 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3428 {
3429         struct bpf_unix_iter_state *iter = seq->private;
3430         struct sock *sk;
3431
3432         /* Whenever seq_next() is called, the iter->cur_sk is
3433          * done with seq_show(), so advance to the next sk in
3434          * the batch.
3435          */
3436         if (iter->cur_sk < iter->end_sk)
3437                 sock_put(iter->batch[iter->cur_sk++]);
3438
3439         ++*pos;
3440
3441         if (iter->cur_sk < iter->end_sk)
3442                 sk = iter->batch[iter->cur_sk];
3443         else
3444                 sk = bpf_iter_unix_batch(seq, pos);
3445
3446         return sk;
3447 }
3448
3449 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3450 {
3451         struct bpf_iter_meta meta;
3452         struct bpf_prog *prog;
3453         struct sock *sk = v;
3454         uid_t uid;
3455         bool slow;
3456         int ret;
3457
3458         if (v == SEQ_START_TOKEN)
3459                 return 0;
3460
3461         slow = lock_sock_fast(sk);
3462
3463         if (unlikely(sk_unhashed(sk))) {
3464                 ret = SEQ_SKIP;
3465                 goto unlock;
3466         }
3467
3468         uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3469         meta.seq = seq;
3470         prog = bpf_iter_get_info(&meta, false);
3471         ret = unix_prog_seq_show(prog, &meta, v, uid);
3472 unlock:
3473         unlock_sock_fast(sk, slow);
3474         return ret;
3475 }
3476
3477 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3478 {
3479         struct bpf_unix_iter_state *iter = seq->private;
3480         struct bpf_iter_meta meta;
3481         struct bpf_prog *prog;
3482
3483         if (!v) {
3484                 meta.seq = seq;
3485                 prog = bpf_iter_get_info(&meta, true);
3486                 if (prog)
3487                         (void)unix_prog_seq_show(prog, &meta, v, 0);
3488         }
3489
3490         if (iter->cur_sk < iter->end_sk)
3491                 bpf_iter_unix_put_batch(iter);
3492 }
3493
/*
 * seq_file callback table made of the bpf_iter_unix_seq_* handlers above.
 * The code that registers it is not in this part of the file.
 */
3494 static const struct seq_operations bpf_iter_unix_seq_ops = {
3495         .start  = bpf_iter_unix_seq_start,
3496         .next   = bpf_iter_unix_seq_next,
3497         .stop   = bpf_iter_unix_seq_stop,
3498         .show   = bpf_iter_unix_seq_show,
3499 };
3500 #endif
3501 #endif
3502
3503 static const struct net_proto_family unix_family_ops = {
3504         .family = PF_UNIX,
3505         .create = unix_create,
3506         .owner  = THIS_MODULE,
3507 };
3508
3509
3510 static int __net_init unix_net_init(struct net *net)
3511 {
3512         int i;
3513
3514         net->unx.sysctl_max_dgram_qlen = 10;
3515         if (unix_sysctl_register(net))
3516                 goto out;
3517
3518 #ifdef CONFIG_PROC_FS
3519         if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3520                              sizeof(struct seq_net_private)))
3521                 goto err_sysctl;
3522 #endif
3523
3524         net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3525                                               sizeof(spinlock_t), GFP_KERNEL);
3526         if (!net->unx.table.locks)
3527                 goto err_proc;
3528
3529         net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3530                                                 sizeof(struct hlist_head),
3531                                                 GFP_KERNEL);
3532         if (!net->unx.table.buckets)
3533                 goto free_locks;
3534
3535         for (i = 0; i < UNIX_HASH_SIZE; i++) {
3536                 spin_lock_init(&net->unx.table.locks[i]);
3537                 INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3538         }
3539
3540         return 0;
3541
3542 free_locks:
3543         kvfree(net->unx.table.locks);
3544 err_proc:
3545 #ifdef CONFIG_PROC_FS
3546         remove_proc_entry("unix", net->proc_net);
3547 err_sysctl:
3548 #endif
3549         unix_sysctl_unregister(net);
3550 out:
3551         return -ENOMEM;
3552 }
3553
3554 static void __net_exit unix_net_exit(struct net *net)
3555 {
3556         kvfree(net->unx.table.buckets);
3557         kvfree(net->unx.table.locks);
3558         unix_sysctl_unregister(net);
3559         remove_proc_entry("unix", net->proc_net);
3560 }
3561
3562 static struct pernet_operations unix_net_ops = {
3563         .init = unix_net_init,
3564         .exit = unix_net_exit,
3565 };
3566
3567 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3568 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3569                      struct unix_sock *unix_sk, uid_t uid)
3570
3571 #define INIT_BATCH_SZ 16
3572
3573 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3574 {
3575         struct bpf_unix_iter_state *iter = priv_data;
3576         int err;
3577
3578         err = bpf_iter_init_seq_net(priv_data, aux);
3579         if (err)
3580                 return err;
3581
3582         err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3583         if (err) {
3584                 bpf_iter_fini_seq_net(priv_data);
3585                 return err;
3586         }
3587
3588         return 0;
3589 }
3590
3591 static void bpf_iter_fini_unix(void *priv_data)
3592 {
3593         struct bpf_unix_iter_state *iter = priv_data;
3594
3595         bpf_iter_fini_seq_net(priv_data);
3596         kvfree(iter->batch);
3597 }
3598
3599 static const struct bpf_iter_seq_info unix_seq_info = {
3600         .seq_ops                = &bpf_iter_unix_seq_ops,
3601         .init_seq_private       = bpf_iter_init_unix,
3602         .fini_seq_private       = bpf_iter_fini_unix,
3603         .seq_priv_size          = sizeof(struct bpf_unix_iter_state),
3604 };
3605
3606 static const struct bpf_func_proto *
3607 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3608                              const struct bpf_prog *prog)
3609 {
3610         switch (func_id) {
3611         case BPF_FUNC_setsockopt:
3612                 return &bpf_sk_setsockopt_proto;
3613         case BPF_FUNC_getsockopt:
3614                 return &bpf_sk_getsockopt_proto;
3615         default:
3616                 return NULL;
3617         }
3618 }
3619
3620 static struct bpf_iter_reg unix_reg_info = {
3621         .target                 = "unix",
3622         .ctx_arg_info_size      = 1,
3623         .ctx_arg_info           = {
3624                 { offsetof(struct bpf_iter__unix, unix_sk),
3625                   PTR_TO_BTF_ID_OR_NULL },
3626         },
3627         .get_func_proto         = bpf_iter_unix_get_func_proto,
3628         .seq_info               = &unix_seq_info,
3629 };
3630
3631 static void __init bpf_iter_register(void)
3632 {
3633         unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3634         if (bpf_iter_reg_target(&unix_reg_info))
3635                 pr_warn("Warning: could not register bpf iterator unix\n");
3636 }
3637 #endif
3638
3639 static int __init af_unix_init(void)
3640 {
3641         int i, rc = -1;
3642
3643         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3644
3645         for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3646                 spin_lock_init(&bsd_socket_locks[i]);
3647                 INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3648         }
3649
3650         rc = proto_register(&unix_dgram_proto, 1);
3651         if (rc != 0) {
3652                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3653                 goto out;
3654         }
3655
3656         rc = proto_register(&unix_stream_proto, 1);
3657         if (rc != 0) {
3658                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3659                 proto_unregister(&unix_dgram_proto);
3660                 goto out;
3661         }
3662
3663         sock_register(&unix_family_ops);
3664         register_pernet_subsys(&unix_net_ops);
3665         unix_bpf_build_proto();
3666
3667 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3668         bpf_iter_register();
3669 #endif
3670
3671 out:
3672         return rc;
3673 }
3674
3675 static void __exit af_unix_exit(void)
3676 {
3677         sock_unregister(PF_UNIX);
3678         proto_unregister(&unix_dgram_proto);
3679         proto_unregister(&unix_stream_proto);
3680         unregister_pernet_subsys(&unix_net_ops);
3681 }
3682
3683 /* Earlier than device_initcall() so that other drivers invoking
3684    request_module() don't end up in a loop when modprobe tries
3685    to use a UNIX socket. But later than subsys_initcall() because
3686    we depend on stuff initialised there */
3687 fs_initcall(af_unix_init);
3688 module_exit(af_unix_exit);
3689
3690 MODULE_LICENSE("GPL");
3691 MODULE_ALIAS_NETPROTO(PF_UNIX);