1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * NET4: Implementation of BSD Unix domain sockets.
5 * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk>
8 * Linus Torvalds : Assorted bug cures.
9 * Niibe Yutaka : async I/O support.
10 * Carsten Paeth : PF_UNIX check, address fixes.
11 * Alan Cox : Limit size of allocated blocks.
12 * Alan Cox : Fixed the stupid socketpair bug.
13 * Alan Cox : BSD compatibility fine tuning.
14 * Alan Cox : Fixed a bug in connect when interrupted.
15 * Alan Cox : Sorted out a proper draft version of
16 * file descriptor passing hacked up from
18 * Marty Leisner : Fixes to fd passing
19 * Nick Nevin : recvmsg bugfix.
20 * Alan Cox : Started proper garbage collector
21 * Heiko EiBfeldt : Missing verify_area check
22 * Alan Cox : Started POSIXisms
23 * Andreas Schwab : Replace inode by dentry for proper
25 * Kirk Petersen : Made this a module
26 * Christoph Rohland : Elegant non-blocking accept/connect algorithm.
28 * Alexey Kuznetsov : Repaired (I hope) bugs introduced
29 * by above two patches.
30 * Andrea Arcangeli : If possible we block in connect(2)
31 * if the max backlog of the listen socket
32 * has been reached. This won't break
33 * old apps and it will avoid a huge number
34 * of hashed socks (this is for unix_gc()
35 * performance reasons).
36 * Security fix that limits the max
37 * number of socks to 2*max_files and
38 * the number of skb queueable in the
40 * Artur Skawina : Hash function optimizations
41 * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8)
42 * Malcolm Beattie : Set peercred for socketpair
43 * Michal Ostrowski : Module initialization cleanup.
44 * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT,
45 * the core infrastructure is doing that
46 * for all net proto families now (2.5.69+)
48 * Known differences from reference BSD that was tested:
51 * ECONNREFUSED is not returned from one end of a connected() socket to the
52 * other the moment one end closes.
53 * fstat() doesn't return st_dev=0, and gives the blksize as high water mark
54 * and a fake inode identifier (nor the BSD first socket fstat twice bug).
56 * accept() returns a path name even if the connecting socket has closed
57 * in the meantime (BSD loses the path and gives up).
58 * accept() returns 0 length path for an unbound connector. BSD returns 16
59 * and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60 * socketpair(...SOCK_RAW..) doesn't panic the kernel.
61 * BSD af_unix apparently has connect forgetting to block properly.
62 * (need to check this with the POSIX spec in detail)
64 * Differences from 2.0.0-11-... (ANK)
65 * Bug fixes and improvements.
66 * - client shutdown killed server socket.
67 * - removed all useless cli/sti pairs.
69 * Semantic changes/extensions.
70 * - generic control message passing.
71 * - SCM_CREDENTIALS control message.
72 * - "Abstract" (not FS based) socket bindings.
73 * Abstract names are sequences of bytes (not zero terminated)
74 * starting with a 0 byte, so that this name space does not intersect
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
// File-wide state: a count of AF_UNIX sockets, plus a hash table (buckets and
// one spinlock per bucket, UNIX_HASH_SIZE / 2 of each) that this file uses to
// find path-bound sockets by inode.
122 static atomic_long_t unix_nr_socks;
123 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
124 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
126 /* SMP locking strategy:
127 * hash table is protected with spinlock.
128 * each socket state is protected by separate spinlock.
// Hash slot for an unbound socket, derived from the socket's own address and
// masked with UNIX_HASH_MOD.
// NOTE(review): the statements between the two visible ones (original lines
// 134-138) are missing from this listing, so any mixing step is not shown.
131 static unsigned int unix_unbound_hash(struct sock *sk)
133 unsigned long hash = (unsigned long)sk;
139 return hash & UNIX_HASH_MOD;
// Hash slot for a path-bound socket: the inode number masked with
// UNIX_HASH_MOD.
142 static unsigned int unix_bsd_hash(struct inode *i)
144 return i->i_ino & UNIX_HASH_MOD;
// Hash slot for an abstract address: checksum the addr_len bytes of sunaddr
// with csum_partial(), fold the checksum, and return
// UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD), i.e. a slot above the range
// returned by unix_unbound_hash() and unix_bsd_hash().
// NOTE(review): lines between the fold and the return are not shown; the
// 'type' parameter is not used in any visible line.
147 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
148 int addr_len, int type)
150 __wsum csum = csum_partial(sunaddr, addr_len, 0);
153 hash = (__force unsigned int)csum_fold(csum);
157 return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
// Take the per-net table locks for two hash slots. When both slots are the
// same only one lock is taken; otherwise hash1's lock is taken first and
// hash2's second, annotated with SINGLE_DEPTH_NESTING.
// NOTE(review): original lines 165-170 are missing from this listing, so any
// ordering of hash1/hash2 done before locking cannot be confirmed here.
160 static void unix_table_double_lock(struct net *net,
161 unsigned int hash1, unsigned int hash2)
163 if (hash1 == hash2) {
164 spin_lock(&net->unx.table.locks[hash1]);
171 spin_lock(&net->unx.table.locks[hash1]);
172 spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
// Release the table locks for two hash slots: one unlock when the slots are
// equal, otherwise both hash1's and hash2's locks are released.
175 static void unix_table_double_unlock(struct net *net,
176 unsigned int hash1, unsigned int hash2)
178 if (hash1 == hash2) {
179 spin_unlock(&net->unx.table.locks[hash1]);
183 spin_unlock(&net->unx.table.locks[hash1]);
184 spin_unlock(&net->unx.table.locks[hash2]);
// Helpers that move the secid between an scm_cookie and an skb's UNIXCB.
// With CONFIG_SECURITY_NETWORK: unix_get_secdata() stores scm->secid in the
// skb, unix_set_secdata() copies it from the skb back into scm, and
// unix_secdata_eq() compares the two.
// NOTE(review): the second group of three definitions is presumably the
// config-off branch; the #else line and their bodies are not shown here.
187 #ifdef CONFIG_SECURITY_NETWORK
188 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
190 UNIXCB(skb).secid = scm->secid;
193 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
195 scm->secid = UNIXCB(skb).secid;
198 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
200 return (scm->secid == UNIXCB(skb).secid);
203 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
206 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
209 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
213 #endif /* CONFIG_SECURITY_NETWORK */
// Shorthand for the peer pointer stored in a socket's unix_sock; usable as an
// lvalue (this file assigns through it).
215 #define unix_peer(sk) (unix_sk(sk)->peer)
// Non-zero when osk's peer is sk.
217 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
219 return unix_peer(osk) == sk;
// sk may send to osk when osk has no peer at all, or when osk's peer is sk.
222 static inline int unix_may_send(struct sock *sk, struct sock *osk)
224 return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
// Non-zero when sk's receive queue holds more skbs than sk_max_ack_backlog.
227 static inline int unix_recvq_full(const struct sock *sk)
229 return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
// Same comparison as unix_recvq_full(), but reads the queue length with
// skb_queue_len_lockless() and the backlog limit with READ_ONCE().
232 static inline int unix_recvq_full_lockless(const struct sock *sk)
234 return skb_queue_len_lockless(&sk->sk_receive_queue) >
235 READ_ONCE(sk->sk_max_ack_backlog);
// Exported (GPL) helper that takes a socket and returns a struct sock pointer.
// Only the release of s's state lock is visible.
// NOTE(review): original lines 239-245 and 247 are missing; presumably the
// peer is read, and a reference taken, while the state lock is held. Confirm
// against the full source.
238 struct sock *unix_peer_get(struct sock *s)
246 unix_state_unlock(s);
249 EXPORT_SYMBOL_GPL(unix_peer_get);
// Allocate a unix_address with room for addr_len bytes of name, set its
// reference count to 1, record the length and copy the sockaddr bytes in.
// NOTE(review): the remainder of the parameter list, the handling of a failed
// kmalloc() and the return statement are not visible in this listing.
251 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
254 struct unix_address *addr;
256 addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
260 refcount_set(&addr->refcnt, 1);
261 addr->len = addr_len;
262 memcpy(addr->name, sunaddr, addr_len);
// Drop one reference on addr.
// NOTE(review): the statement executed when the count reaches zero is not
// shown; presumably it frees addr. Confirm.
267 static inline void unix_release_addr(struct unix_address *addr)
269 if (refcount_dec_and_test(&addr->refcnt))
274 * Check unix socket name:
275 * - it must not be zero length.
276 * - if it starts with a non-zero byte, it should be NUL terminated (FS object)
277 * - if it starts with a zero byte, it is an abstract name.
// Basic sanity checks on a caller-supplied address: the length must be greater
// than the offset of sun_path and no larger than struct sockaddr_un, and the
// family must be AF_UNIX.
// NOTE(review): the values returned for each outcome are not visible here.
280 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
282 if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
283 addr_len > sizeof(*sunaddr))
286 if (sunaddr->sun_family != AF_UNIX)
// NUL-terminate a pathname address in place and return its length: strlen()
// of the path plus the sun_path offset plus one for the terminator.
// The buffer is accessed through struct sockaddr_storage; the BUILD_BUG_ON
// checks that its __data member starts at the same offset as sun_path.
292 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
294 struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
295 short offset = offsetof(struct sockaddr_storage, __data);
297 BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
299 /* This may look like an off by one error but it is a bit more
300 * subtle. 108 is the longest valid AF_UNIX path for a binding.
301 * sun_path[108] doesn't as such exist. However in kernel space
302 * we are guaranteed that it is a valid memory location in our
303 * kernel address buffer because syscall functions always pass
304 * a pointer of struct sockaddr_storage which has a bigger buffer
305 * than 108. Also, we must terminate sun_path for strlen() in
308 addr->__data[addr_len - offset] = 0;
310 /* Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will
311 * cause panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen()
312 * know the actual buffer.
314 return strlen(addr->__data) + offset + 1;
// Unhash sk from its table bucket. The callers visible in this file hold the
// table lock around this call.
317 static void __unix_remove_socket(struct sock *sk)
319 sk_del_node_init(sk);
// Add sk to the per-net bucket selected by sk->sk_hash.
// DEBUG_NET_WARN_ON_ONCE fires if sk is still hashed when this is called.
322 static void __unix_insert_socket(struct net *net, struct sock *sk)
324 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
325 sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
// Move sk to a new address: unhash it, publish addr in the unix_sock with a
// release store, and insert it into the table again.
// NOTE(review): original lines 333-334 are missing; presumably sk->sk_hash is
// set to 'hash' there, since the re-insert uses sk->sk_hash. Confirm.
328 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
329 struct unix_address *addr, unsigned int hash)
331 __unix_remove_socket(sk);
332 smp_store_release(&unix_sk(sk)->addr, addr);
335 __unix_insert_socket(net, sk);
// Locked wrapper: unhash sk while holding the table lock for sk->sk_hash.
338 static void unix_remove_socket(struct net *net, struct sock *sk)
340 spin_lock(&net->unx.table.locks[sk->sk_hash]);
341 __unix_remove_socket(sk);
342 spin_unlock(&net->unx.table.locks[sk->sk_hash]);
// Locked wrapper: insert sk into the table while holding the lock for
// sk->sk_hash. Called from unix_create1() for a freshly created socket.
345 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
347 spin_lock(&net->unx.table.locks[sk->sk_hash]);
348 __unix_insert_socket(net, sk);
349 spin_unlock(&net->unx.table.locks[sk->sk_hash]);
// Add sk, through its bind node, to the file-wide path-socket table in the
// bucket chosen by sk->sk_hash, under that bucket's lock.
352 static void unix_insert_bsd_socket(struct sock *sk)
354 spin_lock(&bsd_socket_locks[sk->sk_hash]);
355 sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
356 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
// If sk's bind node is on a path-socket bucket list, remove it under the
// bucket lock and reinitialise the node. Nothing is done for a socket whose
// bind node is unhashed.
359 static void unix_remove_bsd_socket(struct sock *sk)
361 if (!hlist_unhashed(&sk->sk_bind_node)) {
362 spin_lock(&bsd_socket_locks[sk->sk_hash]);
363 __sk_del_bind_node(sk);
364 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
366 sk_node_init(&sk->sk_bind_node);
// Walk the per-net bucket for 'hash' and compare each socket's bound address
// with sunname: first the length, then the bytes.
// No locking is done here; the callers in this file hold the table lock.
// NOTE(review): the return statements (match and no-match) are not shown.
370 static struct sock *__unix_find_socket_byname(struct net *net,
371 struct sockaddr_un *sunname,
372 int len, unsigned int hash)
376 sk_for_each(s, &net->unx.table.buckets[hash]) {
377 struct unix_sock *u = unix_sk(s);
379 if (u->addr->len == len &&
380 !memcmp(u->addr->name, sunname, len))
// Locked wrapper around __unix_find_socket_byname(): performs the lookup while
// holding the table lock for 'hash'.
// NOTE(review): original lines 394-395 (between the lookup and the unlock) are
// missing; presumably a reference is taken on a match. Confirm.
386 static inline struct sock *unix_find_socket_byname(struct net *net,
387 struct sockaddr_un *sunname,
388 int len, unsigned int hash)
392 spin_lock(&net->unx.table.locks[hash]);
393 s = __unix_find_socket_byname(net, sunname, len, hash);
396 spin_unlock(&net->unx.table.locks[hash]);
// Find the path-bound socket whose dentry's backing inode is i. The bucket is
// chosen with unix_bsd_hash() and walked under that bucket's lock; sockets
// without a dentry are skipped.
// Two unlock sites are visible: one inside the match branch and one after the
// loop. The return statements themselves are not shown.
400 static struct sock *unix_find_socket_byinode(struct inode *i)
402 unsigned int hash = unix_bsd_hash(i);
405 spin_lock(&bsd_socket_locks[hash]);
406 sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
407 struct dentry *dentry = unix_sk(s)->path.dentry;
409 if (dentry && d_backing_inode(dentry) == i) {
411 spin_unlock(&bsd_socket_locks[hash]);
415 spin_unlock(&bsd_socket_locks[hash]);
419 /* Support code for asymmetrically connected dgram sockets
421 * If a datagram socket is connected to a socket not itself connected
422 * to the first socket (eg, /dev/log), clients may only enqueue more
423 * messages if the present receive queue of the server socket is not
424 * "too large". This means there's a second writeability condition
425 * poll and sendmsg need to test. The dgram recv code will do a wake
426 * up on the peer_wait wait queue of a socket upon reception of a
427 * datagram which needs to be propagated to sleeping would-be writers
428 * since these might not have sent anything so far. This can't be
429 * accomplished via poll_wait because the lifetime of the server
430 * socket might be less than that of its clients if these break their
431 * association with it or if the server socket is closed while clients
432 * are still connected to it and there's no way to inform "a polling
433 * implementation" that it should let go of a certain wait queue
435 * In order to propagate a wake up, a wait_queue_entry_t of the client
436 * socket is enqueued on the peer_wait queue of the server socket
437 * whose wake function does a wake_up on the ordinary client socket
438 * wait queue. This connection is established whenever a write (or
439 * poll for write) hits the flow control condition and is broken when the
440 * association to the server socket is dissolved or after a wake up
// Wake function for a socket's peer_wake entry (installed in unix_create1()).
// It recovers the owning unix_sock from the wait-queue entry, removes the entry
// from the peer_wait queue of the socket recorded in peer_wake.private, clears
// that pointer, and then wakes pollers on the owning socket's own wait queue
// with the poll mask taken from 'key'.
// NOTE(review): part of the parameter list and the return value are not shown.
444 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
448 wait_queue_head_t *u_sleep;
450 u = container_of(q, struct unix_sock, peer_wake);
452 __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
454 u->peer_wake.private = NULL;
456 /* relaying can only happen while the wq still exists */
457 u_sleep = sk_sleep(&u->sk);
459 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
// Under other's peer_wait lock: if sk's peer_wake entry is not currently in
// use (private is NULL), point it at 'other' and add it to other's peer_wait
// queue.
// NOTE(review): the assignment of 'u' and the return value are not shown; the
// caller stores the result in a variable named 'connected'.
464 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
466 struct unix_sock *u, *u_other;
470 u_other = unix_sk(other);
472 spin_lock(&u_other->peer_wait.lock);
474 if (!u->peer_wake.private) {
475 u->peer_wake.private = other;
476 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
481 spin_unlock(&u_other->peer_wait.lock);
// Under other's peer_wait lock: if sk's peer_wake entry is registered with
// 'other', remove it from other's peer_wait queue and clear the private
// pointer. A registration with a different socket is left alone.
485 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
488 struct unix_sock *u, *u_other;
491 u_other = unix_sk(other);
492 spin_lock(&u_other->peer_wait.lock);
494 if (u->peer_wake.private == other) {
495 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
496 u->peer_wake.private = NULL;
499 spin_unlock(&u_other->peer_wait.lock);
// Drop sk's wake-up registration with 'other', then wake sleepers on sk's own
// wait queue.
// NOTE(review): the poll mask passed to the wake-up is on lines that are
// missing from this listing.
502 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
505 unix_dgram_peer_wake_disconnect(sk, other);
506 wake_up_interruptible_poll(sk_sleep(sk),
// Register sk for wake-ups from 'other' and then re-check other's queue.
// The first two lines below are the remains of a precondition list.
// The visible test is "other's receive queue is full and other is not
// SOCK_DEAD"; the statement under it is not shown. The following visible path
// undoes the registration again.
// NOTE(review): the use of 'connected' and the return values are on missing
// lines.
513 * - unix_peer(sk) == other
514 * - association is stable
516 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
520 connected = unix_dgram_peer_wake_connect(sk, other);
522 /* If other is SOCK_DEAD, we want to make sure we signal
523 * POLLOUT, such that a subsequent write() can get a
524 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
525 * to other and its full, we will hang waiting for POLLOUT.
527 if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
531 unix_dgram_peer_wake_disconnect(sk, other);
// A socket is writable when it is not in TCP_LISTEN and four times its
// allocated write memory (sk_wmem_alloc << 2) does not exceed sk_sndbuf.
536 static int unix_writable(const struct sock *sk)
538 return sk->sk_state != TCP_LISTEN &&
539 (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
// Installed as sk->sk_write_space by unix_create1(). When the socket is
// writable it wakes any sleeper on the socket wait queue with
// EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND and calls sk_wake_async() with
// SOCK_WAKE_SPACE / POLL_OUT.
// NOTE(review): sk_wq is read with rcu_dereference(); the RCU read-side lock
// calls are presumably on the missing lines. Confirm.
542 static void unix_write_space(struct sock *sk)
544 struct socket_wq *wq;
547 if (unix_writable(sk)) {
548 wq = rcu_dereference(sk->sk_wq);
549 if (skwq_has_sleeper(wq))
550 wake_up_interruptible_sync_poll(&wq->wait,
551 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
552 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
557 /* When a dgram socket disconnects (or changes its peer), we clear its receive
558 * queue of packets that arrived from the previous peer. First, this allows
559 * flow control to be based only on wmem_alloc; second, an sk connected to a peer
560 * may receive messages only from that peer. */
// Called after sk has dropped or replaced its peer 'other'.
// If sk has queued skbs, they are purged and sleepers on sk's peer_wait queue
// are woken. If 'other' is alive and was connected back to sk, it is given
// ECONNRESET and its error is reported. other->sk_state is set to TCP_CLOSE.
// NOTE(review): closing braces are missing from this listing, so the exact
// nesting of the last statements cannot be confirmed here.
561 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
563 if (!skb_queue_empty(&sk->sk_receive_queue)) {
564 skb_queue_purge(&sk->sk_receive_queue);
565 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
567 /* If one link of bidirectional dgram pipe is disconnected,
568 * we signal error. Messages are lost. Do not make this,
569 * when peer was not connected to us.
571 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
572 WRITE_ONCE(other->sk_err, ECONNRESET);
573 sk_error_report(other);
576 other->sk_state = TCP_CLOSE;
// Installed as sk->sk_destruct by unix_create1().
// Purges the receive queue, runs debug sanity checks (no write memory
// outstanding, socket unhashed, no struct socket attached) and logs a message
// when the socket is not flagged SOCK_DEAD (what follows that message is not
// shown). It then drops the address reference and decrements both the global
// socket count and the per-protocol in-use count.
// NOTE(review): any NULL check before unix_release_addr() is not visible.
579 static void unix_sock_destructor(struct sock *sk)
581 struct unix_sock *u = unix_sk(sk);
583 skb_queue_purge(&sk->sk_receive_queue);
585 DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
586 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
587 DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
588 if (!sock_flag(sk, SOCK_DEAD)) {
589 pr_info("Attempt to release alive unix socket: %p\n", sk);
594 unix_release_addr(u->addr);
596 atomic_long_dec(&unix_nr_socks);
597 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
598 #ifdef UNIX_REFCNT_DEBUG
599 pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
600 atomic_long_read(&unix_nr_socks));
// Tear down a socket. 'embrion' is 0 when called from unix_release() and 1
// when this function calls itself for sockets found on a listener's queue.
// Visible steps, in order:
//  - unhash sk from the per-net table and from the path-socket table;
//  - mark it fully shut down, clear its dentry, remember the old state and
//    set TCP_CLOSE, detach the peer pointer;
//  - wake sleepers on its peer_wait queue;
//  - notify and release the former peer;
//  - drain the receive queue;
//  - run the garbage collector when fds are in flight.
// NOTE(review): many lines (locking, braces, the final sock_put) are missing
// from this listing; the comments here describe only what is visible.
604 static void unix_release_sock(struct sock *sk, int embrion)
606 struct unix_sock *u = unix_sk(sk);
612 unix_remove_socket(sock_net(sk), sk);
613 unix_remove_bsd_socket(sk);
618 WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
620 u->path.dentry = NULL;
622 state = sk->sk_state;
623 sk->sk_state = TCP_CLOSE;
625 skpair = unix_peer(sk);
626 unix_peer(sk) = NULL;
628 unix_state_unlock(sk);
630 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
632 kfree_skb(u->oob_skb);
637 wake_up_interruptible_all(&u->peer_wait);
639 if (skpair != NULL) {
// Stream/seqpacket peer: mark it shut down under its own state lock; it gets
// ECONNRESET if we still had unread data or were never accepted (embrion).
640 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
641 unix_state_lock(skpair);
643 WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
644 if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
645 WRITE_ONCE(skpair->sk_err, ECONNRESET);
646 unix_state_unlock(skpair);
647 skpair->sk_state_change(skpair);
648 sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
651 unix_dgram_peer_wake_disconnect(sk, skpair);
652 sock_put(skpair); /* It may now die */
655 /* Try to flush out this socket. Throw out buffers at least */
// For a former listener each queued skb stands for a pending connection whose
// socket (skb->sk) is released recursively with embrion = 1.
657 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
658 if (state == TCP_LISTEN)
659 unix_release_sock(skb->sk, 1);
660 /* passed fds are erased in the kfree_skb hook */
661 UNIXCB(skb).consumed = skb->len;
670 /* ---- Socket is dead now and most probably destroyed ---- */
673 * Fixme: BSD difference: In BSD all sockets connected to us get
674 * ECONNRESET and we die on the spot. In Linux we behave
675 * like files and pipes do and wait for the last
678 * Can't we simply set sock->err?
680 * What the above comment does talk about? --ANK(980817)
683 if (unix_tot_inflight)
684 unix_gc(); /* Garbage collect fds */
// Record the current task's thread-group pid and credentials as sk's peer
// pid/cred, taking a reference on each, under sk_peer_lock. The previous
// values are saved in old_pid / old_cred.
// NOTE(review): what happens to the saved old values after the unlock is on
// missing lines; presumably they are released there. Confirm.
687 static void init_peercred(struct sock *sk)
689 const struct cred *old_cred;
692 spin_lock(&sk->sk_peer_lock);
693 old_pid = sk->sk_peer_pid;
694 old_cred = sk->sk_peer_cred;
695 sk->sk_peer_pid = get_pid(task_tgid(current));
696 sk->sk_peer_cred = get_current_cred();
697 spin_unlock(&sk->sk_peer_lock);
// Copy peersk's peer pid/cred into sk, taking a reference on each, while
// holding both sockets' sk_peer_lock. The previous values of sk are saved in
// old_pid / old_cred.
// Two lock orders are visible (sk first or peersk first, the second lock
// nested); the condition that selects between them is not shown.
703 static void copy_peercred(struct sock *sk, struct sock *peersk)
705 const struct cred *old_cred;
709 spin_lock(&sk->sk_peer_lock);
710 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
712 spin_lock(&peersk->sk_peer_lock);
713 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
715 old_pid = sk->sk_peer_pid;
716 old_cred = sk->sk_peer_cred;
717 sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
718 sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
720 spin_unlock(&sk->sk_peer_lock);
721 spin_unlock(&peersk->sk_peer_lock);
// listen() handler, wired into the stream and seqpacket ops tables.
// Only SOCK_STREAM and SOCK_SEQPACKET sockets may listen, and per the comment
// below an unbound socket is refused. The state check singles out sockets that
// are neither TCP_CLOSE nor TCP_LISTEN (the branch taken is not shown).
// When the new backlog is larger than the old one, sleepers on peer_wait are
// woken; the backlog is then stored and the state becomes TCP_LISTEN.
// NOTE(review): error codes, the state-lock acquisition and the credential
// setup announced by the last comment are on missing lines.
727 static int unix_listen(struct socket *sock, int backlog)
730 struct sock *sk = sock->sk;
731 struct unix_sock *u = unix_sk(sk);
734 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
735 goto out; /* Only stream/seqpacket sockets accept */
738 goto out; /* No listens on an unbound socket */
740 if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
742 if (backlog > sk->sk_max_ack_backlog)
743 wake_up_interruptible_all(&u->peer_wait);
744 sk->sk_max_ack_backlog = backlog;
745 sk->sk_state = TCP_LISTEN;
746 /* set credentials so connect can copy them */
751 unix_state_unlock(sk);
// Forward declarations of the handlers referenced by the proto_ops tables
// that follow.
// NOTE(review): several continuation lines of multi-line prototypes are
// missing from this listing.
756 static int unix_release(struct socket *);
757 static int unix_bind(struct socket *, struct sockaddr *, int);
758 static int unix_stream_connect(struct socket *, struct sockaddr *,
759 int addr_len, int flags);
760 static int unix_socketpair(struct socket *, struct socket *);
761 static int unix_accept(struct socket *, struct socket *, int, bool);
762 static int unix_getname(struct socket *, struct sockaddr *, int);
763 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
764 static __poll_t unix_dgram_poll(struct file *, struct socket *,
766 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
768 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
770 static int unix_shutdown(struct socket *, int);
771 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
772 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
773 static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
774 struct pipe_inode_info *, size_t size,
776 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
777 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
778 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
779 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
780 static int unix_dgram_connect(struct socket *, struct sockaddr *,
782 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
783 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
// set_peek_off handler for all three ops tables: stores val in sk_peek_off
// with WRITE_ONCE() while holding u->iolock. The lock is taken interruptibly.
// NOTE(review): the values returned on interruption and on success are not
// shown.
786 static int unix_set_peek_off(struct sock *sk, int val)
788 struct unix_sock *u = unix_sk(sk);
790 if (mutex_lock_interruptible(&u->iolock))
793 WRITE_ONCE(sk->sk_peek_off, val);
794 mutex_unlock(&u->iolock);
799 #ifdef CONFIG_PROC_FS
// Walk sk's receive queue with skb_peek() / skb_peek_next() under the queue
// lock and add up scm_stat.nr_fds of the socket that owns each queued skb.
// Called from unix_show_fdinfo() for sockets in TCP_LISTEN.
// NOTE(review): the loop statement, the initialisation of nr_fds and the
// return are on missing lines.
800 static int unix_count_nr_fds(struct sock *sk)
806 spin_lock(&sk->sk_receive_queue.lock);
807 skb = skb_peek(&sk->sk_receive_queue);
809 u = unix_sk(skb->sk);
810 nr_fds += atomic_read(&u->scm_stat.nr_fds);
811 skb = skb_peek_next(skb, &sk->sk_receive_queue);
813 spin_unlock(&sk->sk_receive_queue.lock);
// show_fdinfo handler: prints "scm_fds: <n>" to the seq_file.
// For datagram sockets and for established sockets n is the socket's own
// scm_stat.nr_fds; for listening sockets it is the sum computed by
// unix_count_nr_fds(). sk_state is sampled once with READ_ONCE().
// NOTE(review): the assignment of 'u', the initial value of nr_fds and any
// NULL check on sk are on missing lines.
818 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
820 struct sock *sk = sock->sk;
821 unsigned char s_state;
826 s_state = READ_ONCE(sk->sk_state);
829 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
830 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
831 * SOCK_DGRAM is ordinary. So, no lock is needed.
833 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
834 nr_fds = atomic_read(&u->scm_stat.nr_fds);
835 else if (s_state == TCP_LISTEN)
836 nr_fds = unix_count_nr_fds(sk);
838 seq_printf(m, "scm_fds: %u\n", nr_fds);
842 #define unix_show_fdinfo NULL
// Socket operations table that unix_create() assigns to sock->ops for stream
// sockets. Unlike the other two tables it provides read_skb through
// unix_stream_read_skb and a splice_read handler.
// NOTE(review): several members (family, bind, poll, ioctl and others) are on
// lines missing from this listing.
845 static const struct proto_ops unix_stream_ops = {
847 .owner = THIS_MODULE,
848 .release = unix_release,
850 .connect = unix_stream_connect,
851 .socketpair = unix_socketpair,
852 .accept = unix_accept,
853 .getname = unix_getname,
857 .compat_ioctl = unix_compat_ioctl,
859 .listen = unix_listen,
860 .shutdown = unix_shutdown,
861 .sendmsg = unix_stream_sendmsg,
862 .recvmsg = unix_stream_recvmsg,
863 .read_skb = unix_stream_read_skb,
864 .mmap = sock_no_mmap,
865 .splice_read = unix_stream_splice_read,
866 .set_peek_off = unix_set_peek_off,
867 .show_fdinfo = unix_show_fdinfo,
// Socket operations table that unix_create() assigns to sock->ops for
// datagram sockets. accept and listen are the generic "not supported" stubs
// (sock_no_accept / sock_no_listen).
// NOTE(review): several members are on lines missing from this listing.
870 static const struct proto_ops unix_dgram_ops = {
872 .owner = THIS_MODULE,
873 .release = unix_release,
875 .connect = unix_dgram_connect,
876 .socketpair = unix_socketpair,
877 .accept = sock_no_accept,
878 .getname = unix_getname,
879 .poll = unix_dgram_poll,
882 .compat_ioctl = unix_compat_ioctl,
884 .listen = sock_no_listen,
885 .shutdown = unix_shutdown,
886 .sendmsg = unix_dgram_sendmsg,
887 .read_skb = unix_read_skb,
888 .recvmsg = unix_dgram_recvmsg,
889 .mmap = sock_no_mmap,
890 .set_peek_off = unix_set_peek_off,
891 .show_fdinfo = unix_show_fdinfo,
// Socket operations table that unix_create() assigns to sock->ops for
// SOCK_SEQPACKET. It shares connect, accept and listen with the stream table
// and poll with the datagram table; sendmsg and recvmsg have their own
// seqpacket handlers.
// NOTE(review): several members are on lines missing from this listing.
894 static const struct proto_ops unix_seqpacket_ops = {
896 .owner = THIS_MODULE,
897 .release = unix_release,
899 .connect = unix_stream_connect,
900 .socketpair = unix_socketpair,
901 .accept = unix_accept,
902 .getname = unix_getname,
903 .poll = unix_dgram_poll,
906 .compat_ioctl = unix_compat_ioctl,
908 .listen = unix_listen,
909 .shutdown = unix_shutdown,
910 .sendmsg = unix_seqpacket_sendmsg,
911 .recvmsg = unix_seqpacket_recvmsg,
912 .mmap = sock_no_mmap,
913 .set_peek_off = unix_set_peek_off,
914 .show_fdinfo = unix_show_fdinfo,
// Intentionally empty close hook. unix_release() calls sk->sk_prot->close()
// before doing the real teardown in unix_release_sock().
917 static void unix_close(struct sock *sk, long timeout)
919 /* Nothing to do here, unix socket does not need a ->close().
920 * This is merely for sockmap.
// Intentionally empty unhash hook; referenced as .unhash in unix_stream_proto.
924 static void unix_unhash(struct sock *sk)
926 /* Nothing to do here, unix socket does not need a ->unhash().
927 * This is merely for sockmap.
// Callback stored in .bpf_bypass_getsockopt of both struct proto definitions
// in this file. Only the test for level == SOL_SOCKET is visible.
// NOTE(review): the option names examined and the values returned are on
// missing lines.
931 static bool unix_bpf_bypass_getsockopt(int level, int optname)
933 if (level == SOL_SOCKET) {
// struct proto handed to sk_alloc() by unix_create1() for every type other
// than SOCK_STREAM (datagram and seqpacket). Objects are sized as
// struct unix_sock.
// NOTE(review): the name, close hook and other members are on missing lines.
945 struct proto unix_dgram_proto = {
947 .owner = THIS_MODULE,
948 .obj_size = sizeof(struct unix_sock),
950 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
951 #ifdef CONFIG_BPF_SYSCALL
952 .psock_update_sk_prot = unix_dgram_bpf_update_proto,
// struct proto handed to sk_alloc() by unix_create1() for SOCK_STREAM
// sockets. Objects are sized as struct unix_sock; the unhash hook is the
// empty unix_unhash().
// NOTE(review): the close hook and other members are on missing lines.
956 struct proto unix_stream_proto = {
957 .name = "UNIX-STREAM",
958 .owner = THIS_MODULE,
959 .obj_size = sizeof(struct unix_sock),
961 .unhash = unix_unhash,
962 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
963 #ifdef CONFIG_BPF_SYSCALL
964 .psock_update_sk_prot = unix_stream_bpf_update_proto,
// Allocate and initialise a new AF_UNIX sock of the given type.
// The global socket count is incremented first and then compared against
// 2 * get_max_files(). The sock is allocated with the stream proto for
// SOCK_STREAM and with the dgram proto otherwise. After generic
// initialisation the unix-specific hooks, locks, wait queue and statistics
// are set up, and the socket is inserted into the table as unbound.
// On the visible error path the global count is decremented again and an
// ERR_PTR is returned.
// NOTE(review): the error codes, the assignment of 'u' and the success return
// are on missing lines.
968 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
974 atomic_long_inc(&unix_nr_socks);
975 if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
980 if (type == SOCK_STREAM)
981 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
982 else /*dgram and seqpacket */
983 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
990 sock_init_data(sock, sk);
992 sk->sk_hash = unix_unbound_hash(sk);
993 sk->sk_allocation = GFP_KERNEL_ACCOUNT;
994 sk->sk_write_space = unix_write_space;
995 sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen;
996 sk->sk_destruct = unix_sock_destructor;
998 u->path.dentry = NULL;
1000 spin_lock_init(&u->lock);
1001 atomic_long_set(&u->inflight, 0);
1002 INIT_LIST_HEAD(&u->link);
1003 mutex_init(&u->iolock); /* single task reading lock */
1004 mutex_init(&u->bindlock); /* single task binding lock */
1005 init_waitqueue_head(&u->peer_wait);
1006 init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1007 memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1008 unix_insert_unbound_socket(net, sk);
1010 sock_prot_inuse_add(net, sk->sk_prot, 1);
1015 atomic_long_dec(&unix_nr_socks);
1016 return ERR_PTR(err);
// Socket creation entry point for this family.
// A protocol other than 0 or PF_UNIX is rejected with -EPROTONOSUPPORT. The
// ops table is chosen by sock->type; an unsupported type yields
// -ESOCKTNOSUPPORT. One branch rewrites sock->type to SOCK_DGRAM before using
// the datagram ops (going by the comment, this is the SOCK_RAW case; the case
// label itself is not shown). unix_create1() then builds the sock.
// NOTE(review): most case labels and the handling of unix_create1()'s result
// are on missing lines.
1019 static int unix_create(struct net *net, struct socket *sock, int protocol,
1024 if (protocol && protocol != PF_UNIX)
1025 return -EPROTONOSUPPORT;
1027 sock->state = SS_UNCONNECTED;
1029 switch (sock->type) {
1031 sock->ops = &unix_stream_ops;
1034 * Believe it or not BSD has AF_UNIX, SOCK_RAW though
1038 sock->type = SOCK_DGRAM;
1041 sock->ops = &unix_dgram_ops;
1043 case SOCK_SEQPACKET:
1044 sock->ops = &unix_seqpacket_ops;
1047 return -ESOCKTNOSUPPORT;
1050 sk = unix_create1(net, sock, kern, sock->type);
// release handler for all three ops tables: calls the proto's close hook with
// a zero timeout, then tears the socket down with unix_release_sock(sk, 0).
// NOTE(review): any NULL check on sk, the clearing of sock->sk and the return
// value are on missing lines.
1057 static int unix_release(struct socket *sock)
1059 struct sock *sk = sock->sk;
1064 sk->sk_prot->close(sk, 0);
1065 unix_release_sock(sk, 0);
// Resolve a pathname address to the socket bound to it.
// The path is NUL-terminated in place, looked up with LOOKUP_FOLLOW, and must
// grant MAY_WRITE. With the error preset to -ECONNREFUSED the inode must be a
// socket inode, and the bound socket is then found by inode. The found
// socket's type is compared with the requested type. Failures are returned as
// ERR_PTR(err).
// NOTE(review): the branches taken after each check, path_put() calls and the
// success return are on missing lines.
1071 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1074 struct inode *inode;
1079 unix_mkname_bsd(sunaddr, addr_len);
1080 err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1084 err = path_permission(&path, MAY_WRITE);
1088 err = -ECONNREFUSED;
1089 inode = d_backing_inode(path.dentry);
1090 if (!S_ISSOCK(inode->i_mode))
1093 sk = unix_find_socket_byinode(inode);
1098 if (sk->sk_type == type)
1112 return ERR_PTR(err);
// Look up an abstract address in the per-net table using
// unix_abstract_hash(). ERR_PTR(-ECONNREFUSED) is returned on one path
// (presumably when nothing is found; the condition line is not shown).
// touch_atime() is applied to the found socket's path; the condition guarding
// that call is also not shown.
1115 static struct sock *unix_find_abstract(struct net *net,
1116 struct sockaddr_un *sunaddr,
1117 int addr_len, int type)
1119 unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1120 struct dentry *dentry;
1123 sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1125 return ERR_PTR(-ECONNREFUSED);
1127 dentry = unix_sk(sk)->path.dentry;
1129 touch_atime(&unix_sk(sk)->path);
// Dispatch an address lookup on the first byte of sun_path: a non-zero byte
// means a filesystem path (unix_find_bsd), a zero byte means an abstract name
// (unix_find_abstract).
1134 static struct sock *unix_find_other(struct net *net,
1135 struct sockaddr_un *sunaddr,
1136 int addr_len, int type)
1140 if (sunaddr->sun_path[0])
1141 sk = unix_find_bsd(sunaddr, addr_len, type);
1143 sk = unix_find_abstract(net, sunaddr, addr_len, type);
// Bind sk to an automatically chosen abstract name, under u->bindlock.
// The name is six bytes long: the zero byte left by kzalloc() followed by five
// lower-case hex digits. The digits come from a 20-bit counter that starts at
// a random value and is incremented (mod 2^20) on every attempt. Each
// candidate is checked under the table locks for the old and new hash slots;
// a free name is installed with __unix_set_addr_hash(). When the counter
// comes back to its starting value the search gives up and the address is
// released.
// NOTE(review): the retry label, error codes and the early-exit check for an
// already bound socket are on missing lines.
1148 static int unix_autobind(struct sock *sk)
1150 unsigned int new_hash, old_hash = sk->sk_hash;
1151 struct unix_sock *u = unix_sk(sk);
1152 struct net *net = sock_net(sk);
1153 struct unix_address *addr;
1154 u32 lastnum, ordernum;
1157 err = mutex_lock_interruptible(&u->bindlock);
1165 addr = kzalloc(sizeof(*addr) +
1166 offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1170 addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1171 addr->name->sun_family = AF_UNIX;
1172 refcount_set(&addr->refcnt, 1);
1174 ordernum = get_random_u32();
1175 lastnum = ordernum & 0xFFFFF;
1177 ordernum = (ordernum + 1) & 0xFFFFF;
1178 sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1180 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1181 unix_table_double_lock(net, old_hash, new_hash);
1183 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1184 unix_table_double_unlock(net, old_hash, new_hash);
1186 /* __unix_find_socket_byname() may take long time if many names
1187 * are already in use.
1191 if (ordernum == lastnum) {
1192 /* Give up if all names seems to be in use. */
1194 unix_release_addr(addr);
1201 __unix_set_addr_hash(net, sk, addr, new_hash);
1202 unix_table_double_unlock(net, old_hash, new_hash);
1205 out: mutex_unlock(&u->bindlock);
// Bind sk to a filesystem path.
// The mode of the new node is S_IFSOCK plus the socket inode's mode bits
// filtered through the current umask. The address is NUL-terminated and
// copied, the parent directory is resolved with kern_path_create(), and the
// node is created with vfs_mknod() after the security hook has been
// consulted. Then, under u->bindlock and the table locks, the mount and
// dentry references are stored in u->path, the address is installed, and the
// socket is added to the inode-hashed path-socket table.
// If a step after the mknod fails, the created node is unlinked again.
// -EEXIST is reported to the caller as -EADDRINUSE.
// NOTE(review): the labels and gotos that connect the success and failure
// paths are on missing lines.
1209 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1212 umode_t mode = S_IFSOCK |
1213 (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1214 unsigned int new_hash, old_hash = sk->sk_hash;
1215 struct unix_sock *u = unix_sk(sk);
1216 struct net *net = sock_net(sk);
1217 struct mnt_idmap *idmap;
1218 struct unix_address *addr;
1219 struct dentry *dentry;
1223 addr_len = unix_mkname_bsd(sunaddr, addr_len);
1224 addr = unix_create_addr(sunaddr, addr_len);
1229 * Get the parent directory, calculate the hash for last
1232 dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1233 if (IS_ERR(dentry)) {
1234 err = PTR_ERR(dentry);
1239 * All right, let's create it.
1241 idmap = mnt_idmap(parent.mnt);
1242 err = security_path_mknod(&parent, dentry, mode, 0);
1244 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1247 err = mutex_lock_interruptible(&u->bindlock);
1253 new_hash = unix_bsd_hash(d_backing_inode(dentry));
1254 unix_table_double_lock(net, old_hash, new_hash);
1255 u->path.mnt = mntget(parent.mnt);
1256 u->path.dentry = dget(dentry);
1257 __unix_set_addr_hash(net, sk, addr, new_hash);
1258 unix_table_double_unlock(net, old_hash, new_hash);
1259 unix_insert_bsd_socket(sk);
1260 mutex_unlock(&u->bindlock);
1261 done_path_create(&parent, dentry);
1265 mutex_unlock(&u->bindlock);
1268 /* failed after successful mknod? unlink what we'd created... */
1269 vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1271 done_path_create(&parent, dentry);
1273 unix_release_addr(addr);
1274 return err == -EEXIST ? -EADDRINUSE : err;
// Bind sk to an abstract name.
// The address is copied, then under u->bindlock and the table locks for the
// old and new hash slots the table is searched for the same name. When the
// name is free it is installed with __unix_set_addr_hash(). The visible
// failure path drops the table locks, the bindlock and the address reference.
// NOTE(review): error codes, labels and the check for an already bound socket
// are on missing lines.
1277 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1280 unsigned int new_hash, old_hash = sk->sk_hash;
1281 struct unix_sock *u = unix_sk(sk);
1282 struct net *net = sock_net(sk);
1283 struct unix_address *addr;
1286 addr = unix_create_addr(sunaddr, addr_len);
1290 err = mutex_lock_interruptible(&u->bindlock);
1299 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1300 unix_table_double_lock(net, old_hash, new_hash);
1302 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1305 __unix_set_addr_hash(net, sk, addr, new_hash);
1306 unix_table_double_unlock(net, old_hash, new_hash);
1307 mutex_unlock(&u->bindlock);
1311 unix_table_double_unlock(net, old_hash, new_hash);
1314 mutex_unlock(&u->bindlock);
1316 unix_release_addr(addr);
// bind() handler. An address that consists of the family field only (length
// equal to the offset of sun_path, family AF_UNIX) requests an automatic
// abstract name via unix_autobind(). Any other address is validated and then
// dispatched on the first byte of sun_path: non-zero binds to a filesystem
// path, zero binds to an abstract name.
1320 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1322 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1323 struct sock *sk = sock->sk;
1326 if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1327 sunaddr->sun_family == AF_UNIX)
1328 return unix_autobind(sk);
1330 err = unix_validate_addr(sunaddr, addr_len);
1334 if (sunaddr->sun_path[0])
1335 err = unix_bind_bsd(sk, sunaddr, addr_len);
1337 err = unix_bind_abstract(sk, sunaddr, addr_len);
/*
 * Take the state locks of sk1 and sk2.
 *
 * When sk2 is NULL or is sk1 itself, only sk1 is locked.  Otherwise both
 * are locked, the second one with the _nested variant, in one of two
 * orders (sk1 then sk2, or sk2 then sk1).
 * NOTE(review): the order is presumably chosen by comparing the two
 * pointers so that every caller locks a given pair the same way -- confirm.
 */
1342 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1344 if (unlikely(sk1 == sk2) || !sk2) {
1345 unix_state_lock(sk1);
1349 unix_state_lock(sk1);
1350 unix_state_lock_nested(sk2);
1352 unix_state_lock(sk2);
1353 unix_state_lock_nested(sk1);
/*
 * Counterpart of unix_state_double_lock(): unlock only sk1 when sk2 is
 * NULL or the same socket, otherwise unlock both sk1 and sk2.
 */
1357 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1359 if (unlikely(sk1 == sk2) || !sk2) {
1360 unix_state_unlock(sk1);
1363 unix_state_unlock(sk1);
1364 unix_state_unlock(sk2);
/*
 * connect() for datagram sockets: set or replace the peer of sk, or break
 * the connected state when the address family is AF_UNSPEC.
 *
 * For a real address, the address is validated and a socket that has
 * SOCK_PASSCRED or SOCK_PASSPIDFD set but no address yet is autobound.
 * The target is then looked up with unix_find_other() and locked together
 * with sk.  A target that is already SOCK_DEAD is unlocked again (the
 * existing comment says the lookup is retried).  unix_may_send() and the
 * LSM hook are consulted before both ends are marked TCP_ESTABLISHED.
 *
 * If sk already had a peer, the old peer is notified through
 * unix_dgram_peer_wake_disconnect_wakeup() and, when it is not the new
 * peer, through unix_dgram_disconnected() after the locks are dropped.
 */
1367 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1368 int alen, int flags)
1370 struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1371 struct sock *sk = sock->sk;
1376 if (alen < offsetofend(struct sockaddr, sa_family))
1379 if (addr->sa_family != AF_UNSPEC) {
1380 err = unix_validate_addr(sunaddr, alen);
1384 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1385 test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1386 !unix_sk(sk)->addr) {
1387 err = unix_autobind(sk);
1393 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1394 if (IS_ERR(other)) {
1395 err = PTR_ERR(other);
1399 unix_state_double_lock(sk, other);
1401 /* Apparently VFS overslept socket death. Retry. */
1402 if (sock_flag(other, SOCK_DEAD)) {
1403 unix_state_double_unlock(sk, other);
1409 if (!unix_may_send(sk, other))
1412 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1416 sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1419 * 1003.1g breaking connected state with AF_UNSPEC
1422 unix_state_double_lock(sk, other);
1426 * If it was connected, reconnect.
1428 if (unix_peer(sk)) {
1429 struct sock *old_peer = unix_peer(sk);
1431 unix_peer(sk) = other;
1433 sk->sk_state = TCP_CLOSE;
1434 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1436 unix_state_double_unlock(sk, other);
1438 if (other != old_peer)
1439 unix_dgram_disconnected(sk, old_peer);
1442 unix_peer(sk) = other;
1443 unix_state_double_unlock(sk, other);
1449 unix_state_double_unlock(sk, other);
/*
 * Wait on other's peer_wait queue, interruptibly and exclusively, for at
 * most timeo.
 *
 * Called with other's state lock held; the lock is released in here (see
 * the __releases() annotation) before schedule_timeout() is reached.
 * "sched" is true only while other is not dead, has not shut down its
 * receive side and its receive queue is full.  timeo is updated with the
 * value returned by schedule_timeout().
 */
1455 static long unix_wait_for_peer(struct sock *other, long timeo)
1456 __releases(&unix_sk(other)->lock)
1458 struct unix_sock *u = unix_sk(other);
1462 prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1464 sched = !sock_flag(other, SOCK_DEAD) &&
1465 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1466 unix_recvq_full_lockless(other);
1468 unix_state_unlock(other);
1471 timeo = schedule_timeout(timeo);
1473 finish_wait(&u->peer_wait, &wait);
/*
 * connect() for connection-oriented AF_UNIX sockets.
 *
 * Outline of the visible steps:
 *  - validate the address; autobind sk if SOCK_PASSCRED or
 *    SOCK_PASSPIDFD is set and it has no address yet;
 *  - allocate everything up front: a new sock (newsk) and a one-byte skb
 *    charged to newsk;
 *  - look the listener up with unix_find_other() and take its state
 *    lock; it must be in TCP_LISTEN and must not have RCV_SHUTDOWN set,
 *    and a full receive queue leads to unix_wait_for_peer();
 *  - take sk's state lock nested inside the listener's, re-check that
 *    sk's state is still "st", and call the LSM hook;
 *  - make newsk the peer of sk: it is marked TCP_ESTABLISHED, inherits
 *    sk's type and the listener's path and address (published with
 *    smp_store_release()), and credentials are set on both ends;
 *  - queue the skb on the listener's receive queue and call its
 *    sk_data_ready() callback.
 *
 * The final lines are unwinding: unlock the listener and release newsk.
 */
1477 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1478 int addr_len, int flags)
1480 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1481 struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1482 struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1483 struct net *net = sock_net(sk);
1484 struct sk_buff *skb = NULL;
1489 err = unix_validate_addr(sunaddr, addr_len);
1493 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1494 test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1495 err = unix_autobind(sk);
1500 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1502 /* First of all allocate resources.
1503 If we will make it after state is locked,
1504 we will have to recheck all again in any case.
1507 /* create new sock for complete connection */
1508 newsk = unix_create1(net, NULL, 0, sock->type);
1509 if (IS_ERR(newsk)) {
1510 err = PTR_ERR(newsk);
1517 /* Allocate skb for sending to listening sock */
1518 skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1523 /* Find listening sock. */
1524 other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1525 if (IS_ERR(other)) {
1526 err = PTR_ERR(other);
1531 /* Latch state of peer */
1532 unix_state_lock(other);
1534 /* Apparently VFS overslept socket death. Retry. */
1535 if (sock_flag(other, SOCK_DEAD)) {
1536 unix_state_unlock(other);
1541 err = -ECONNREFUSED;
1542 if (other->sk_state != TCP_LISTEN)
1544 if (other->sk_shutdown & RCV_SHUTDOWN)
1547 if (unix_recvq_full(other)) {
1552 timeo = unix_wait_for_peer(other, timeo);
1554 err = sock_intr_errno(timeo);
1555 if (signal_pending(current))
1563 It is tricky place. We need to grab our state lock and cannot
1564 drop lock on peer. It is dangerous because deadlock is
1565 possible. Connect to self case and simultaneous
1566 attempt to connect are eliminated by checking socket
1567 state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1568 check this before attempt to grab lock.
1570 Well, and we have to recheck the state after socket locked.
1576 /* This is ok... continue with connect */
1578 case TCP_ESTABLISHED:
1579 /* Socket is already connected */
1587 unix_state_lock_nested(sk);
1589 if (sk->sk_state != st) {
1590 unix_state_unlock(sk);
1591 unix_state_unlock(other);
1596 err = security_unix_stream_connect(sk, other, newsk);
1598 unix_state_unlock(sk);
1602 /* The way is open! Fastly set all the necessary fields... */
1605 unix_peer(newsk) = sk;
1606 newsk->sk_state = TCP_ESTABLISHED;
1607 newsk->sk_type = sk->sk_type;
1608 init_peercred(newsk);
1609 newu = unix_sk(newsk);
1610 RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1611 otheru = unix_sk(other);
1613 /* copy address information from listening to new sock
1615 * The contents of *(otheru->addr) and otheru->path
1616 * are seen fully set up here, since we have found
1617 * otheru in hash under its lock. Insertion into the
1618 * hash chain we'd found it in had been done in an
1619 * earlier critical area protected by the chain's lock,
1620 * the same one where we'd set *(otheru->addr) contents,
1621 * as well as otheru->path and otheru->addr itself.
1623 * Using smp_store_release() here to set newu->addr
1624 * is enough to make those stores, as well as stores
1625 * to newu->path visible to anyone who gets newu->addr
1626 * by smp_load_acquire(). IOW, the same warranties
1627 * as for unix_sock instances bound in unix_bind() or
1628 * in unix_autobind().
1630 if (otheru->path.dentry) {
1631 path_get(&otheru->path);
1632 newu->path = otheru->path;
1634 refcount_inc(&otheru->addr->refcnt);
1635 smp_store_release(&newu->addr, otheru->addr);
1637 /* Set credentials */
1638 copy_peercred(sk, other);
1640 sock->state = SS_CONNECTED;
1641 sk->sk_state = TCP_ESTABLISHED;
1644 smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1645 unix_peer(sk) = newsk;
1647 unix_state_unlock(sk);
1649 /* take ten and send info to listening sock */
1650 spin_lock(&other->sk_receive_queue.lock);
1651 __skb_queue_tail(&other->sk_receive_queue, skb);
1652 spin_unlock(&other->sk_receive_queue.lock);
1653 unix_state_unlock(other);
1654 other->sk_data_ready(other);
1660 unix_state_unlock(other);
1665 unix_release_sock(newsk, 0);
/*
 * socketpair(): make the socks behind socka and sockb each other's peer
 * and mark both ends connected (TCP_ESTABLISHED on the socks,
 * SS_CONNECTED on the sockets).
 *
 * Note that the local "skb" here is the second struct sock, not a socket
 * buffer.
 */
1671 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1673 struct sock *ska = socka->sk, *skb = sockb->sk;
1675 /* Join our sockets back to back */
1678 unix_peer(ska) = skb;
1679 unix_peer(skb) = ska;
1683 ska->sk_state = TCP_ESTABLISHED;
1684 skb->sk_state = TCP_ESTABLISHED;
1685 socka->state = SS_CONNECTED;
1686 sockb->state = SS_CONNECTED;
/*
 * Propagate the SOCK_PASSCRED, SOCK_PASSPIDFD and SOCK_PASSSEC flag bits
 * from old to new.  Only bits that are set on old are set on new; bits
 * already set on new are never cleared.
 */
1690 static void unix_sock_inherit_flags(const struct socket *old,
1693 if (test_bit(SOCK_PASSCRED, &old->flags))
1694 set_bit(SOCK_PASSCRED, &new->flags);
1695 if (test_bit(SOCK_PASSPIDFD, &old->flags))
1696 set_bit(SOCK_PASSPIDFD, &new->flags);
1697 if (test_bit(SOCK_PASSSEC, &old->flags))
1698 set_bit(SOCK_PASSSEC, &new->flags);
/*
 * accept(): the socket type is checked against SOCK_STREAM and
 * SOCK_SEQPACKET and the state against TCP_LISTEN.
 *
 * One skb is taken from the listener's receive queue with
 * skb_recv_datagram(), without blocking when O_NONBLOCK is set.  The skb
 * is then freed and tasks sleeping on the listener's peer_wait queue are
 * woken.  Finally, under tsk's state lock, newsock is marked
 * SS_CONNECTED, inherits the listener's pass-* flags and has tsk grafted
 * onto it.
 */
1701 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1704 struct sock *sk = sock->sk;
1706 struct sk_buff *skb;
1710 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1714 if (sk->sk_state != TCP_LISTEN)
1717 /* If socket state is TCP_LISTEN it cannot change (for now...),
1718 * so that no locks are necessary.
1721 skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1724 /* This means receive shutdown. */
1731 skb_free_datagram(sk, skb);
1732 wake_up_interruptible(&unix_sk(sk)->peer_wait);
1734 /* attach accepted sock to socket */
1735 unix_state_lock(tsk);
1736 newsock->state = SS_CONNECTED;
1737 unix_sock_inherit_flags(sock, newsock);
1738 sock_graft(tsk, newsock);
1739 unix_state_unlock(tsk);
/*
 * Report an address into uaddr: that of the socket itself or, via
 * unix_peer_get(), that of its peer.
 *
 * The address pointer is read with smp_load_acquire().  Two results are
 * visible: AF_UNIX with an empty sun_path and a length of
 * offsetof(struct sockaddr_un, sun_path), or a copy of addr->len bytes of
 * the stored addr->name.
 * NOTE(review): the first form is presumably used when no address is
 * bound -- confirm.
 */
1747 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1749 struct sock *sk = sock->sk;
1750 struct unix_address *addr;
1751 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1755 sk = unix_peer_get(sk);
1765 addr = smp_load_acquire(&unix_sk(sk)->addr);
1767 sunaddr->sun_family = AF_UNIX;
1768 sunaddr->sun_path[0] = 0;
1769 err = offsetof(struct sockaddr_un, sun_path);
1772 memcpy(sunaddr, addr->name, addr->len);
1779 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1781 scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1784 * Garbage collection of unix sockets starts by selecting a set of
1785 * candidate sockets which have reference only from being in flight
1786 * (total_refs == inflight_refs). This condition is checked once during
1787 * the candidate collection phase, and candidates are marked as such, so
1788 * that non-candidates can later be ignored. While inflight_refs is
1789 * protected by unix_gc_lock, total_refs (file count) is not, hence this
1790 * is an instantaneous decision.
1792 * Once a candidate, however, the socket must not be reinstalled into a
1793 * file descriptor while the garbage collection is in progress.
1795 * If the above conditions are met, then the directed graph of
1796 * candidates (*) does not change while unix_gc_lock is held.
1798 * Any operation that changes the file count through file descriptors
1799 * (dup, close, sendmsg) does not change the graph since candidates are
1800 * not installed in fds.
1802 * Dequeuing a candidate via recvmsg would install it into an fd, but
1803 * that takes unix_gc_lock to decrement the inflight count, so it's
1804 * serialized with garbage collection.
1806 * MSG_PEEK is special in that it does not change the inflight count,
1807 * yet does install the socket into an fd. The following lock/unlock
1808 * pair is to ensure serialization with garbage collection. It must be
1809 * done between incrementing the file count and installing the file into
1812 * If garbage collection starts after the barrier provided by the
1813 * lock/unlock, then it will see the elevated refcount and not mark this
1814 * as a candidate. If a garbage collection is already in progress
1815 * before the file count was incremented, then the lock/unlock pair will
1816 * ensure that garbage collection is finished before progressing to
1817 * installing the fd.
1819 * (*) A -> B where B is on the queue of A or B is on the queue of C
1820 * which is on the queue of listening socket A.
1822 spin_lock(&unix_gc_lock);
1823 spin_unlock(&unix_gc_lock);
/*
 * Copy the sender's control information from scm into skb's control
 * block: the pid (taking a reference with get_pid()), uid and gid, with
 * the fd list initially NULL, plus security data via unix_get_secdata().
 * File descriptors are attached with unix_attach_fds() only when scm
 * carries some and send_fds is true.  skb's destructor is set to
 * unix_destruct_scm.
 */
1826 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1830 UNIXCB(skb).pid = get_pid(scm->pid);
1831 UNIXCB(skb).uid = scm->creds.uid;
1832 UNIXCB(skb).gid = scm->creds.gid;
1833 UNIXCB(skb).fp = NULL;
1834 unix_get_secdata(scm, skb);
1835 if (scm->fp && send_fds)
1836 err = unix_attach_fds(scm, skb);
1838 skb->destructor = unix_destruct_scm;
/*
 * True when either end has SOCK_PASSCRED or SOCK_PASSPIDFD set, or when
 * other has no struct socket attached (other->sk_socket is NULL).
 * maybe_add_creds() uses this to decide whether to stamp credentials.
 */
1842 static bool unix_passcred_enabled(const struct socket *sock,
1843 const struct sock *other)
1845 return test_bit(SOCK_PASSCRED, &sock->flags) ||
1846 test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1847 !other->sk_socket ||
1848 test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1849 test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1853 * Some apps rely on write() giving SCM_CREDENTIALS
1854 * We include credentials if source or destination socket
1855 * asserted SOCK_PASSCRED.
/*
 * When unix_passcred_enabled() says so, record the current task's tgid
 * (holding a pid reference) and its uid/gid in skb's control block.
 * NOTE(review): the UNIXCB(skb).pid test comes first, presumably so that
 * credentials already present on the skb are left alone -- confirm.
 */
1857 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1858 const struct sock *other)
1860 if (UNIXCB(skb).pid)
1862 if (unix_passcred_enabled(sock, other)) {
1863 UNIXCB(skb).pid = get_pid(task_tgid(current));
1864 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
/*
 * True when the pid, uid, gid and security data recorded in skb all
 * match those held in scm.  The stream read path uses this to avoid
 * merging data from different writers.
 */
1868 static bool unix_skb_scm_eq(struct sk_buff *skb,
1869 struct scm_cookie *scm)
1871 return UNIXCB(skb).pid == scm->pid &&
1872 uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1873 gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1874 unix_secdata_eq(scm, skb);
/*
 * Add the number of file descriptors carried by skb to sk's
 * scm_stat.nr_fds counter.  An skb without an fd list, or with an empty
 * one, leaves the counter untouched.
 */
1877 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1879 struct scm_fp_list *fp = UNIXCB(skb).fp;
1880 struct unix_sock *u = unix_sk(sk);
1882 if (unlikely(fp && fp->count))
1883 atomic_add(fp->count, &u->scm_stat.nr_fds);
/*
 * Inverse of scm_stat_add(): subtract the number of file descriptors
 * carried by skb from sk's scm_stat.nr_fds counter.
 */
1886 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1888 struct scm_fp_list *fp = UNIXCB(skb).fp;
1889 struct unix_sock *u = unix_sk(sk);
1891 if (unlikely(fp && fp->count))
1892 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1896 * Send AF_UNIX data.
/*
 * sendmsg() for datagram sockets; unix_seqpacket_sendmsg() also ends up
 * here.
 *
 * Outline of the visible steps:
 *  - collect control data with scm_send(); MSG_OOB is tested explicitly;
 *  - the destination is the address in msg->msg_name (validated, later
 *    resolved with unix_find_other()) or the connected peer obtained
 *    from unix_peer_get();
 *  - a sender with SOCK_PASSCRED or SOCK_PASSPIDFD set and no address is
 *    autobound;
 *  - len is checked against sk->sk_sndbuf - 32; whatever exceeds
 *    SKB_MAX_ALLOC goes into page fragments (data_len);
 *  - the skb is run through sk_filter() of the receiver; a filtered
 *    packet is tossed without reporting an error to the sender;
 *  - if the receiver is SOCK_DEAD: a SOCK_SEQPACKET sender keeps its
 *    state, while a sender still connected to that receiver drops it as
 *    peer, goes to TCP_CLOSE and gets -ECONNREFUSED;
 *  - a receiver with a full queue may be waited for with
 *    unix_wait_for_peer(), or both sockets are locked and the queue
 *    re-examined (restart_locked);
 *  - on success the skb is timestamped if the receiver has
 *    SOCK_RCVTSTAMP, gets credentials if required, is counted in the
 *    receiver's fd statistics, queued, and sk_data_ready() is called.
 */
1899 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1902 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1903 struct sock *sk = sock->sk, *other = NULL;
1904 struct unix_sock *u = unix_sk(sk);
1905 struct scm_cookie scm;
1906 struct sk_buff *skb;
1913 err = scm_send(sock, msg, &scm, false);
1918 if (msg->msg_flags&MSG_OOB)
1921 if (msg->msg_namelen) {
1922 err = unix_validate_addr(sunaddr, msg->msg_namelen);
1928 other = unix_peer_get(sk);
1933 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1934 test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1935 err = unix_autobind(sk);
1941 if (len > sk->sk_sndbuf - 32)
1944 if (len > SKB_MAX_ALLOC) {
1945 data_len = min_t(size_t,
1946 len - SKB_MAX_ALLOC,
1947 MAX_SKB_FRAGS * PAGE_SIZE);
1948 data_len = PAGE_ALIGN(data_len);
1950 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1953 skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1954 msg->msg_flags & MSG_DONTWAIT, &err,
1955 PAGE_ALLOC_COSTLY_ORDER);
1959 err = unix_scm_to_skb(&scm, skb, true);
1963 skb_put(skb, len - data_len);
1964 skb->data_len = data_len;
1966 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1970 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1975 if (sunaddr == NULL)
1978 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
1980 if (IS_ERR(other)) {
1981 err = PTR_ERR(other);
1987 if (sk_filter(other, skb) < 0) {
1988 /* Toss the packet but do not return any error to the sender */
1994 unix_state_lock(other);
1997 if (!unix_may_send(sk, other))
2000 if (unlikely(sock_flag(other, SOCK_DEAD))) {
2002 * Check with 1003.1g - what should
2005 unix_state_unlock(other);
2009 unix_state_lock(sk);
2012 if (sk->sk_type == SOCK_SEQPACKET) {
2013 /* We are here only when racing with unix_release_sock()
2014 * is clearing @other. Never change state to TCP_CLOSE
2015 * unlike SOCK_DGRAM wants.
2017 unix_state_unlock(sk);
2019 } else if (unix_peer(sk) == other) {
2020 unix_peer(sk) = NULL;
2021 unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2023 sk->sk_state = TCP_CLOSE;
2024 unix_state_unlock(sk);
2026 unix_dgram_disconnected(sk, other);
2028 err = -ECONNREFUSED;
2030 unix_state_unlock(sk);
2040 if (other->sk_shutdown & RCV_SHUTDOWN)
2043 if (sk->sk_type != SOCK_SEQPACKET) {
2044 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2049 /* other == sk && unix_peer(other) != sk if
2050 * - unix_peer(sk) == NULL, destination address bound to sk
2051 * - unix_peer(sk) == sk by time of get but disconnected before lock
2054 unlikely(unix_peer(other) != sk &&
2055 unix_recvq_full_lockless(other))) {
2057 timeo = unix_wait_for_peer(other, timeo);
2059 err = sock_intr_errno(timeo);
2060 if (signal_pending(current))
2067 unix_state_unlock(other);
2068 unix_state_double_lock(sk, other);
2071 if (unix_peer(sk) != other ||
2072 unix_dgram_peer_wake_me(sk, other)) {
2080 goto restart_locked;
2084 if (unlikely(sk_locked))
2085 unix_state_unlock(sk);
2087 if (sock_flag(other, SOCK_RCVTSTAMP))
2088 __net_timestamp(skb);
2089 maybe_add_creds(skb, sock, other);
2090 scm_stat_add(other, skb);
2091 skb_queue_tail(&other->sk_receive_queue, skb);
2092 unix_state_unlock(other);
2093 other->sk_data_ready(other);
2100 unix_state_unlock(sk);
2101 unix_state_unlock(other);
/*
 * UNIX_SKB_FRAGS_SZ bounds the paged part of a single stream skb.
 * unix_stream_sendmsg() uses it both when sizing each skb
 * (SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ) and, through get_order(), as the
 * allocation order passed to sock_alloc_send_pskb().
 */
2111 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2112 * bytes, and a minimum of a full page.
2114 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2116 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * Queue one out-of-band byte from msg on other.
 *
 * A one-byte skb is allocated on the sending socket, filled in by
 * unix_scm_to_skb() (fds are attached only if they were not sent
 * already) and loaded with one byte from the message iterator.  Under
 * other's state lock, a receiver that is SOCK_DEAD or has RCV_SHUTDOWN
 * set takes the early-unlock path.  Otherwise credentials are added if
 * needed, any previously pending oob_skb is dropped with consume_skb(),
 * the new skb is published as ousk->oob_skb with WRITE_ONCE(), queued on
 * the receive queue, and the receiver is signalled through
 * sk_send_sigurg() and sk_data_ready().
 */
2117 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2118 struct scm_cookie *scm, bool fds_sent)
2120 struct unix_sock *ousk = unix_sk(other);
2121 struct sk_buff *skb;
2124 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2129 err = unix_scm_to_skb(scm, skb, !fds_sent);
2135 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2142 unix_state_lock(other);
2144 if (sock_flag(other, SOCK_DEAD) ||
2145 (other->sk_shutdown & RCV_SHUTDOWN)) {
2146 unix_state_unlock(other);
2151 maybe_add_creds(skb, sock, other);
2155 consume_skb(ousk->oob_skb);
2157 WRITE_ONCE(ousk->oob_skb, skb);
2159 scm_stat_add(other, skb);
2160 skb_queue_tail(&other->sk_receive_queue, skb);
2161 sk_send_sigurg(other);
2162 unix_state_unlock(other);
2163 other->sk_data_ready(other);
/*
 * sendmsg() for stream sockets.
 *
 * A destination address is not accepted: with msg_namelen set the error
 * is -EISCONN on an established socket and -EOPNOTSUPP otherwise.  The
 * peer is taken from unix_peer(sk) and SEND_SHUTDOWN on sk is checked
 * before any data is sent.
 *
 * Data is cut into skbs inside the "sent < len" loop.  Each skb is
 * limited to half of sk_sndbuf minus 64 bytes and to
 * SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ; for MSG_SPLICE_PAGES an empty skb
 * is allocated and the pages are spliced in with skb_splice_from_iter().
 * Passed fds travel only in the first skb.  Every skb is appended to the
 * peer's receive queue under the peer's state lock, after checking that
 * the peer is not SOCK_DEAD and has not set RCV_SHUTDOWN.
 *
 * With CONFIG_AF_UNIX_OOB, MSG_OOB additionally goes through
 * queue_oob().  SIGPIPE is sent when nothing was sent and MSG_NOSIGNAL
 * is clear.  The return value is the byte count sent, or err when that
 * count is zero.
 */
2169 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2172 struct sock *sk = sock->sk;
2173 struct sock *other = NULL;
2175 struct sk_buff *skb;
2177 struct scm_cookie scm;
2178 bool fds_sent = false;
2182 err = scm_send(sock, msg, &scm, false);
2187 if (msg->msg_flags & MSG_OOB) {
2188 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2196 if (msg->msg_namelen) {
2197 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2201 other = unix_peer(sk);
2206 if (sk->sk_shutdown & SEND_SHUTDOWN)
2209 while (sent < len) {
2212 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2213 skb = sock_alloc_send_pskb(sk, 0, 0,
2214 msg->msg_flags & MSG_DONTWAIT,
2217 /* Keep two messages in the pipe so it schedules better */
2218 size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2220 /* allow fallback to order-0 allocations */
2221 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2223 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2225 data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2227 skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2228 msg->msg_flags & MSG_DONTWAIT, &err,
2229 get_order(UNIX_SKB_FRAGS_SZ));
2234 /* Only send the fds in the first buffer */
2235 err = unix_scm_to_skb(&scm, skb, !fds_sent);
2242 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2243 err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2250 refcount_add(size, &sk->sk_wmem_alloc);
2252 skb_put(skb, size - data_len);
2253 skb->data_len = data_len;
2255 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2262 unix_state_lock(other);
2264 if (sock_flag(other, SOCK_DEAD) ||
2265 (other->sk_shutdown & RCV_SHUTDOWN))
2268 maybe_add_creds(skb, sock, other);
2269 scm_stat_add(other, skb);
2270 skb_queue_tail(&other->sk_receive_queue, skb);
2271 unix_state_unlock(other);
2272 other->sk_data_ready(other);
2276 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2277 if (msg->msg_flags & MSG_OOB) {
2278 err = queue_oob(sock, msg, other, &scm, fds_sent);
2290 unix_state_unlock(other);
2293 if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2294 send_sig(SIGPIPE, current, 0);
2298 return sent ? : err;
/*
 * sendmsg() for SOCK_SEQPACKET.  A pending socket error is fetched with
 * sock_error() and the socket state is checked against TCP_ESTABLISHED.
 * Any destination address supplied by the caller is then ignored by
 * forcing msg_namelen to 0, and the message is sent through
 * unix_dgram_sendmsg().
 */
2301 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2305 struct sock *sk = sock->sk;
2307 err = sock_error(sk);
2311 if (sk->sk_state != TCP_ESTABLISHED)
2314 if (msg->msg_namelen)
2315 msg->msg_namelen = 0;
2317 return unix_dgram_sendmsg(sock, msg, len);
/*
 * recvmsg() for SOCK_SEQPACKET: checks the socket state against
 * TCP_ESTABLISHED, then receives through unix_dgram_recvmsg().
 */
2320 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2321 size_t size, int flags)
2323 struct sock *sk = sock->sk;
2325 if (sk->sk_state != TCP_ESTABLISHED)
2328 return unix_dgram_recvmsg(sock, msg, size, flags);
/*
 * Copy sk's address, read with smp_load_acquire(), into msg->msg_name
 * and set msg->msg_namelen to its length.
 */
2331 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2333 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2336 msg->msg_namelen = addr->len;
2337 memcpy(msg->msg_name, addr->name, addr->len);
/*
 * Receive one datagram from sk into msg.
 *
 * u->iolock is held while a packet is pulled with
 * __skb_try_recv_datagram() and released again around
 * __skb_wait_for_more_packets().  When no skb is obtained, a
 * SOCK_SEQPACKET socket that got -EAGAIN and has RCV_SHUTDOWN set is
 * special-cased to signal EOF.  Writers sleeping on u->peer_wait are
 * woken once a packet has been taken.
 *
 * The sender's address is copied from skb->sk.  The copy is limited to
 * what remains of the datagram after the peek offset; a smaller "size"
 * sets MSG_TRUNC in msg->msg_flags.  Credentials and security data go
 * into a local scm cookie.  Without MSG_PEEK the passed fds are detached
 * from the skb; with MSG_PEEK they are duplicated through
 * unix_peek_fds() and the peek offset is moved forward.
 *
 * On the visible success path err becomes the number of bytes copied or,
 * when the caller passed MSG_TRUNC in flags, the remaining datagram
 * length.
 */
2341 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2344 struct scm_cookie scm;
2345 struct socket *sock = sk->sk_socket;
2346 struct unix_sock *u = unix_sk(sk);
2347 struct sk_buff *skb, *last;
2356 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2359 mutex_lock(&u->iolock);
2361 skip = sk_peek_offset(sk, flags);
2362 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2363 &skip, &err, &last);
2365 if (!(flags & MSG_PEEK))
2366 scm_stat_del(sk, skb);
2370 mutex_unlock(&u->iolock);
2375 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2376 &err, &timeo, last));
2378 if (!skb) { /* implies iolock unlocked */
2379 unix_state_lock(sk);
2380 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2381 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2382 (sk->sk_shutdown & RCV_SHUTDOWN))
2384 unix_state_unlock(sk);
2388 if (wq_has_sleeper(&u->peer_wait))
2389 wake_up_interruptible_sync_poll(&u->peer_wait,
2390 EPOLLOUT | EPOLLWRNORM |
2394 unix_copy_addr(msg, skb->sk);
2396 if (size > skb->len - skip)
2397 size = skb->len - skip;
2398 else if (size < skb->len - skip)
2399 msg->msg_flags |= MSG_TRUNC;
2401 err = skb_copy_datagram_msg(skb, skip, msg, size);
2405 if (sock_flag(sk, SOCK_RCVTSTAMP))
2406 __sock_recv_timestamp(msg, sk, skb);
2408 memset(&scm, 0, sizeof(scm));
2410 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2411 unix_set_secdata(&scm, skb);
2413 if (!(flags & MSG_PEEK)) {
2415 unix_detach_fds(&scm, skb);
2417 sk_peek_offset_bwd(sk, skb->len);
2419 /* It is questionable: on PEEK we could:
2420 - do not return fds - good, but too simple 8)
2421 - return fds, and do not return them on read (old strategy,
2423 - clone fds (I chose it for now, it is the most universal
2426 POSIX 1003.1g does not actually define this clearly
2427 at all. POSIX 1003.1g doesn't define a lot of things
2432 sk_peek_offset_fwd(sk, size);
2435 unix_peek_fds(&scm, skb);
2437 err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2439 scm_recv_unix(sock, msg, &scm, flags);
2442 skb_free_datagram(sk, skb);
2443 mutex_unlock(&u->iolock);
/*
 * recvmsg() for datagram sockets.  With CONFIG_BPF_SYSCALL, a socket
 * whose sk_prot is no longer unix_dgram_proto receives through that
 * proto's own recvmsg hook; otherwise __unix_dgram_recvmsg() is used.
 */
2448 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2451 struct sock *sk = sock->sk;
2453 #ifdef CONFIG_BPF_SYSCALL
2454 const struct proto *prot = READ_ONCE(sk->sk_prot);
2456 if (prot != &unix_dgram_proto)
2457 return prot->recvmsg(sk, msg, size, flags, NULL);
2459 return __unix_dgram_recvmsg(sk, msg, size, flags);
/*
 * Dequeue one skb from sk without blocking (MSG_DONTWAIT), holding
 * u->iolock only around the dequeue, and hand it to recv_actor.  The
 * actor's return value is returned to the caller.
 */
2462 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2464 struct unix_sock *u = unix_sk(sk);
2465 struct sk_buff *skb;
2468 mutex_lock(&u->iolock);
2469 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2470 mutex_unlock(&u->iolock);
2474 return recv_actor(sk, skb);
2478 * Sleep until more data has arrived. But check for races..
/*
 * Sleep interruptibly on sk's wait queue for at most timeo.  The visible
 * stop conditions are: the tail skb's length no longer equals last_len,
 * RCV_SHUTDOWN is set, or a signal is pending.  sk's state lock is
 * dropped around schedule_timeout() and SOCKWQ_ASYNC_WAITDATA is set for
 * the duration of the sleep.  A socket found SOCK_DEAD after waking is
 * checked for explicitly.  "freezable" adds TASK_FREEZABLE to the sleep
 * state.
 */
2480 static long unix_stream_data_wait(struct sock *sk, long timeo,
2481 struct sk_buff *last, unsigned int last_len,
2484 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2485 struct sk_buff *tail;
2488 unix_state_lock(sk);
2491 prepare_to_wait(sk_sleep(sk), &wait, state);
2493 tail = skb_peek_tail(&sk->sk_receive_queue);
2495 (tail && tail->len != last_len) ||
2497 (sk->sk_shutdown & RCV_SHUTDOWN) ||
2498 signal_pending(current) ||
2502 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2503 unix_state_unlock(sk);
2504 timeo = schedule_timeout(timeo);
2505 unix_state_lock(sk);
2507 if (sock_flag(sk, SOCK_DEAD))
2510 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2513 finish_wait(sk_sleep(sk), &wait);
2514 unix_state_unlock(sk);
/*
 * Length of skb minus the bytes already accounted in
 * UNIXCB(skb).consumed, i.e. what is still unread.
 */
2518 static unsigned int unix_skb_len(const struct sk_buff *skb)
2520 return skb->len - UNIXCB(skb).consumed;
/*
 * Arguments shared by the stream read paths.
 *
 * recv_actor: callback that moves a chunk of an skb to the destination
 *             (unix_stream_read_actor or unix_stream_splice_actor).
 * socket:     the socket being read from.
 * pipe, splice_flags: destination pipe and splice flags, passed on to
 *             skb_splice_bits() by the splice actor.
 */
2523 struct unix_stream_read_state {
2524 int (*recv_actor)(struct sk_buff *, int, int,
2525 struct unix_stream_read_state *);
2526 struct socket *socket;
2528 struct pipe_inode_info *pipe;
2531 unsigned int splice_flags;
2534 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * Handle a MSG_OOB read on a stream socket.
 *
 * With u->iolock and sk's state lock held, the request takes the
 * early-unlock path when SOCK_URGINLINE is set or no oob_skb is pending.
 * Otherwise the pending OOB skb is handed to state->recv_actor.  Unless
 * MSG_PEEK is given, u->oob_skb is cleared before the copy and one byte
 * is added to the skb's consumed count afterwards.  MSG_OOB is set in
 * the message's msg_flags.
 */
2535 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2537 struct socket *sock = state->socket;
2538 struct sock *sk = sock->sk;
2539 struct unix_sock *u = unix_sk(sk);
2541 struct sk_buff *oob_skb;
2543 mutex_lock(&u->iolock);
2544 unix_state_lock(sk);
2546 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2547 unix_state_unlock(sk);
2548 mutex_unlock(&u->iolock);
2552 oob_skb = u->oob_skb;
2554 if (!(state->flags & MSG_PEEK))
2555 WRITE_ONCE(u->oob_skb, NULL);
2557 unix_state_unlock(sk);
2559 chunk = state->recv_actor(oob_skb, 0, chunk, state);
2561 if (!(state->flags & MSG_PEEK)) {
2562 UNIXCB(oob_skb).consumed += 1;
2566 mutex_unlock(&u->iolock);
2571 state->msg->msg_flags |= MSG_OOB;
/*
 * Adjust the skb a normal stream read is about to use when out-of-band
 * data may be queued.
 *
 * A fully consumed skb is unlinked from the receive queue (not when
 * peeking).  When skb is the pending OOB skb: with SOCK_URGINLINE,
 * u->oob_skb is cleared unless peeking; without it, and unless peeking,
 * the skb is unlinked and the new head of the queue is used instead.
 */
2575 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2576 int flags, int copied)
2578 struct unix_sock *u = unix_sk(sk);
2580 if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2581 skb_unlink(skb, &sk->sk_receive_queue);
2585 if (skb == u->oob_skb) {
2588 } else if (sock_flag(sk, SOCK_URGINLINE)) {
2589 if (!(flags & MSG_PEEK)) {
2590 WRITE_ONCE(u->oob_skb, NULL);
2593 } else if (!(flags & MSG_PEEK)) {
2594 skb_unlink(skb, &sk->sk_receive_queue);
2596 skb = skb_peek(&sk->sk_receive_queue);
/*
 * read_skb hook for stream sockets: checks the state against
 * TCP_ESTABLISHED, then defers to unix_read_skb().
 */
2604 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2606 if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2609 return unix_read_skb(sk, recv_actor);
/*
 * Receive loop shared by recvmsg() and splice_read() on stream sockets;
 * state->recv_actor moves the bytes.
 *
 * The socket state is checked against TCP_ESTABLISHED.  MSG_OOB requests
 * are passed to unix_stream_recv_urg() when CONFIG_AF_UNIX_OOB is
 * enabled.  u->iolock is held while skbs are processed and released
 * around unix_stream_data_wait().  Before waiting, the loop checks
 * whether "target" bytes have been copied, then sock_error(), then
 * RCV_SHUTDOWN.
 *
 * Within one call, data is not merged across skbs whose credentials
 * differ (unix_skb_scm_eq()); with SOCK_PASSCRED or SOCK_PASSPIDFD the
 * credentials of the first skb are copied into the scm cookie.  The
 * sender's address is copied once.
 *
 * Without MSG_PEEK the consumed count of the skb is advanced, passed fds
 * are detached and the skb is unlinked from the receive queue.  With
 * MSG_PEEK the fds are duplicated via unix_peek_fds() and the peek
 * offset is moved forward.
 *
 * Returns the number of bytes copied, or err when nothing was copied.
 */
2612 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2615 struct scm_cookie scm;
2616 struct socket *sock = state->socket;
2617 struct sock *sk = sock->sk;
2618 struct unix_sock *u = unix_sk(sk);
2620 int flags = state->flags;
2621 int noblock = flags & MSG_DONTWAIT;
2622 bool check_creds = false;
2627 size_t size = state->size;
2628 unsigned int last_len;
2630 if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2635 if (unlikely(flags & MSG_OOB)) {
2637 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2638 err = unix_stream_recv_urg(state);
2643 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2644 timeo = sock_rcvtimeo(sk, noblock);
2646 memset(&scm, 0, sizeof(scm));
2648 /* Lock the socket to prevent queue disordering
2649 * while sleeps in memcpy_tomsg
2651 mutex_lock(&u->iolock);
2653 skip = max(sk_peek_offset(sk, flags), 0);
2658 struct sk_buff *skb, *last;
2661 unix_state_lock(sk);
2662 if (sock_flag(sk, SOCK_DEAD)) {
2666 last = skb = skb_peek(&sk->sk_receive_queue);
2667 last_len = last ? last->len : 0;
2669 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2671 skb = manage_oob(skb, sk, flags, copied);
2673 unix_state_unlock(sk);
2682 if (copied >= target)
2686 * POSIX 1003.1g mandates this order.
2689 err = sock_error(sk);
2692 if (sk->sk_shutdown & RCV_SHUTDOWN)
2695 unix_state_unlock(sk);
2701 mutex_unlock(&u->iolock);
2703 timeo = unix_stream_data_wait(sk, timeo, last,
2704 last_len, freezable);
2706 if (signal_pending(current)) {
2707 err = sock_intr_errno(timeo);
2712 mutex_lock(&u->iolock);
2715 unix_state_unlock(sk);
2719 while (skip >= unix_skb_len(skb)) {
2720 skip -= unix_skb_len(skb);
2722 last_len = skb->len;
2723 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2728 unix_state_unlock(sk);
2731 /* Never glue messages from different writers */
2732 if (!unix_skb_scm_eq(skb, &scm))
2734 } else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2735 test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2736 /* Copy credentials */
2737 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2738 unix_set_secdata(&scm, skb);
2742 /* Copy address just once */
2743 if (state->msg && state->msg->msg_name) {
2744 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2745 state->msg->msg_name);
2746 unix_copy_addr(state->msg, skb->sk);
2750 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2752 chunk = state->recv_actor(skb, skip, chunk, state);
2753 drop_skb = !unix_skb_len(skb);
2754 /* skb is only safe to use if !drop_skb */
2765 /* the skb was touched by a concurrent reader;
2766 * we should not expect anything from this skb
2767 * anymore and assume it invalid - we can be
2768 * sure it was dropped from the socket queue
2770 * let's report a short read
2776 /* Mark read part of skb as used */
2777 if (!(flags & MSG_PEEK)) {
2778 UNIXCB(skb).consumed += chunk;
2780 sk_peek_offset_bwd(sk, chunk);
2782 if (UNIXCB(skb).fp) {
2783 scm_stat_del(sk, skb);
2784 unix_detach_fds(&scm, skb);
2787 if (unix_skb_len(skb))
2790 skb_unlink(skb, &sk->sk_receive_queue);
2796 /* It is questionable, see note in unix_dgram_recvmsg.
2799 unix_peek_fds(&scm, skb);
2801 sk_peek_offset_fwd(sk, chunk);
2808 last_len = skb->len;
2809 unix_state_lock(sk);
2810 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2813 unix_state_unlock(sk);
2818 mutex_unlock(&u->iolock);
2820 scm_recv_unix(sock, state->msg, &scm, flags);
2824 return copied ? : err;
/*
 * recv_actor for recvmsg(): copy from skb into the message, starting
 * "skip" bytes past what has already been consumed.  Returns the error
 * from skb_copy_datagram_msg() if there is one, otherwise chunk.
 */
2827 static int unix_stream_read_actor(struct sk_buff *skb,
2828 int skip, int chunk,
2829 struct unix_stream_read_state *state)
2833 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2835 return ret ?: chunk;
/*
 * Stream receive starting from a struct sock: builds a read state around
 * sk->sk_socket with unix_stream_read_actor and runs
 * unix_stream_read_generic() with its second argument set to true.
 * NOTE(review): that argument appears to be the "freezable" flag later
 * passed to unix_stream_data_wait() -- confirm.
 */
2838 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2839 size_t size, int flags)
2841 struct unix_stream_read_state state = {
2842 .recv_actor = unix_stream_read_actor,
2843 .socket = sk->sk_socket,
2849 return unix_stream_read_generic(&state, true);
/*
 * recvmsg() for stream sockets.  With CONFIG_BPF_SYSCALL, a socket whose
 * sk_prot is no longer unix_stream_proto receives through that proto's
 * own recvmsg hook; otherwise the read state built here (using
 * unix_stream_read_actor) is run through unix_stream_read_generic().
 */
2852 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2853 size_t size, int flags)
2855 struct unix_stream_read_state state = {
2856 .recv_actor = unix_stream_read_actor,
2863 #ifdef CONFIG_BPF_SYSCALL
2864 struct sock *sk = sock->sk;
2865 const struct proto *prot = READ_ONCE(sk->sk_prot);
2867 if (prot != &unix_stream_proto)
2868 return prot->recvmsg(sk, msg, size, flags, NULL);
2870 return unix_stream_read_generic(&state, true);
/*
 * recv_actor for splice_read(): splice chunk bytes of skb, starting
 * "skip" bytes past what has already been consumed, into state->pipe
 * using state->splice_flags.  Returns what skb_splice_bits() returns.
 */
2873 static int unix_stream_splice_actor(struct sk_buff *skb,
2874 int skip, int chunk,
2875 struct unix_stream_read_state *state)
2877 return skb_splice_bits(skb, state->socket->sk,
2878 UNIXCB(skb).consumed + skip,
2879 state->pipe, chunk, state->splice_flags);
/*
 * splice_read() for stream sockets.  A non-zero *ppos is tested for
 * explicitly.  The read becomes non-blocking (MSG_DONTWAIT) when the
 * file has O_NONBLOCK set or the caller passed SPLICE_F_NONBLOCK.  Data
 * is moved by unix_stream_splice_actor through
 * unix_stream_read_generic(), called with its second argument false.
 */
2882 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
2883 struct pipe_inode_info *pipe,
2884 size_t size, unsigned int flags)
2886 struct unix_stream_read_state state = {
2887 .recv_actor = unix_stream_splice_actor,
2891 .splice_flags = flags,
2894 if (unlikely(*ppos))
2897 if (sock->file->f_flags & O_NONBLOCK ||
2898 flags & SPLICE_F_NONBLOCK)
2899 state.flags = MSG_DONTWAIT;
2901 return unix_stream_read_generic(&state, false);
/*
 * shutdown(): mode is range-checked against SHUT_RD..SHUT_RDWR and,
 * following the mapping listed below, OR-ed into sk->sk_shutdown under
 * sk's state lock, after which sk's sk_state_change() callback runs.
 *
 * For SOCK_STREAM and SOCK_SEQPACKET the peer is updated as well: its
 * proto's unhash hook is called, it receives the mirrored bits (our
 * RCV_SHUTDOWN becomes its SEND_SHUTDOWN and vice versa), its
 * sk_state_change() callback runs, and async waiters are signalled with
 * POLL_HUP for a full shutdown or POLL_IN when only its receive side was
 * shut down.
 */
2904 static int unix_shutdown(struct socket *sock, int mode)
2906 struct sock *sk = sock->sk;
2909 if (mode < SHUT_RD || mode > SHUT_RDWR)
2912 * SHUT_RD (0) -> RCV_SHUTDOWN (1)
2913 * SHUT_WR (1) -> SEND_SHUTDOWN (2)
2914 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2918 unix_state_lock(sk);
2919 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2920 other = unix_peer(sk);
2923 unix_state_unlock(sk);
2924 sk->sk_state_change(sk);
2927 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2930 const struct proto *prot = READ_ONCE(other->sk_prot);
2933 prot->unhash(other);
2934 if (mode&RCV_SHUTDOWN)
2935 peer_mode |= SEND_SHUTDOWN;
2936 if (mode&SEND_SHUTDOWN)
2937 peer_mode |= RCV_SHUTDOWN;
2938 unix_state_lock(other);
2939 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2940 unix_state_unlock(other);
2941 other->sk_state_change(other);
2942 if (peer_mode == SHUTDOWN_MASK)
2943 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2944 else if (peer_mode & RCV_SHUTDOWN)
2945 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
/*
 * Amount of data waiting in sk's receive queue.  A socket in TCP_LISTEN
 * is tested for first.  Under the receive-queue spinlock, SOCK_STREAM
 * and SOCK_SEQPACKET sockets add up the unconsumed length of every
 * queued skb, while other socket types look only at the head skb.
 * Exported for use outside this file.
 */
2953 long unix_inq_len(struct sock *sk)
2955 struct sk_buff *skb;
2958 if (sk->sk_state == TCP_LISTEN)
2961 spin_lock(&sk->sk_receive_queue.lock);
2962 if (sk->sk_type == SOCK_STREAM ||
2963 sk->sk_type == SOCK_SEQPACKET) {
2964 skb_queue_walk(&sk->sk_receive_queue, skb)
2965 amount += unix_skb_len(skb);
2967 skb = skb_peek(&sk->sk_receive_queue);
2971 spin_unlock(&sk->sk_receive_queue.lock);
2975 EXPORT_SYMBOL_GPL(unix_inq_len);
/*
 * Amount of send-side memory currently charged to sk, as reported by
 * sk_wmem_alloc_get().  Exported for use outside this file.
 */
2977 long unix_outq_len(struct sock *sk)
2979 return sk_wmem_alloc_get(sk);
2981 EXPORT_SYMBOL_GPL(unix_outq_len);
/*
 * Open the filesystem object sk is bound to.  The caller needs
 * CAP_NET_ADMIN in the user namespace of the socket's network namespace,
 * and the socket must have an address.  A new close-on-exec descriptor
 * is reserved and the socket's path is opened O_PATH with the caller's
 * credentials.
 */
2983 static int unix_open_file(struct sock *sk)
2989 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2992 if (!smp_load_acquire(&unix_sk(sk)->addr))
2995 path = unix_sk(sk)->path;
3001 fd = get_unused_fd_flags(O_CLOEXEC);
3005 f = dentry_open(&path, O_PATH, current_cred());
/*
 * unix_ioctl() - ioctl handler for AF_UNIX sockets.
 * @sock: socket the ioctl was issued on
 * @cmd: ioctl command
 * @arg: user-space argument; the commands shown below treat it as a
 *       pointer to int
 *
 * The visible commands report the outgoing queue length, report the
 * incoming queue length, open the bound file through unix_open_file()
 * and, with CONFIG_AF_UNIX_OOB, report whether the skb at the head of
 * the receive queue is the out-of-band skb.
 */
3019 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3021 struct sock *sk = sock->sk;
     /* Outgoing queue length, stored to user space through an int pointer. */
3027 amount = unix_outq_len(sk);
3028 err = put_user(amount, (int __user *)arg);
     /* Incoming queue length, stored the same way. */
3031 amount = unix_inq_len(sk);
3035 err = put_user(amount, (int __user *)arg);
     /* Hand back a descriptor for the bound path. */
3038 err = unix_open_file(sk);
3040 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3043 struct sk_buff *skb;
     /*
      * NOTE(review): the queue head is peeked with no lock visible in
      * these lines, and only oob_skb is read with READ_ONCE() - confirm
      * that this lockless comparison is intended.
      */
3046 skb = skb_peek(&sk->sk_receive_queue);
3047 if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3049 err = put_user(answ, (int __user *)arg);
/*
 * unix_compat_ioctl() - compat ioctl entry point (CONFIG_COMPAT only).
 * Converts @arg with compat_ptr() and forwards the call unchanged to
 * unix_ioctl().
 */
3060 #ifdef CONFIG_COMPAT
3061 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3063 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
/*
 * unix_poll() - poll handler.
 * @file: file the poll was issued on
 * @sock: socket being polled
 * @wait: poll table passed on to sock_poll_wait()
 *
 * Builds the event mask from the socket error, a snapshot of
 * sk_shutdown, the receive queue, the out-of-band skb (with
 * CONFIG_AF_UNIX_OOB) and unix_writable().
 */
3067 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3069 struct sock *sk = sock->sk;
3073 sock_poll_wait(file, sock, wait);
     /* Read sk_shutdown once so that every test below sees the same value. */
3075 shutdown = READ_ONCE(sk->sk_shutdown);
3077 /* exceptional events? */
3078 if (READ_ONCE(sk->sk_err))
3080 if (shutdown == SHUTDOWN_MASK)
     /* A receive-side shutdown is reported as RDHUP and as readable. */
3082 if (shutdown & RCV_SHUTDOWN)
3083 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
     /* Readable when data is queued or when sk_is_readable() says so. */
3086 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3087 mask |= EPOLLIN | EPOLLRDNORM;
3088 if (sk_is_readable(sk))
3089 mask |= EPOLLIN | EPOLLRDNORM;
3090 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3091 if (READ_ONCE(unix_sk(sk)->oob_skb))
     /*
      * NOTE(review): sk_state is read below without READ_ONCE() while the
      * neighbouring fields use it - confirm whether an annotation is needed.
      */
3095 /* Connection-based need to check for termination and startup */
3096 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3097 sk->sk_state == TCP_CLOSE)
3101 * we set writable also when the other side has shut down the
3102 * connection. This prevents stuck sockets.
3104 if (unix_writable(sk))
3105 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
/*
 * unix_dgram_poll() - poll handler that additionally takes the state of
 * the peer receive queue into account before reporting writability.
 * @file: file the poll was issued on
 * @sock: socket being polled
 *
 * The read side mirrors the tests on sk_err, the error queue,
 * sk_shutdown and the receive queue.  The write side is evaluated only
 * when the caller asked for write events.
 */
3110 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3113 struct sock *sk = sock->sk, *other;
3114 unsigned int writable;
3118 sock_poll_wait(file, sock, wait);
     /* Read sk_shutdown once so that every test below sees the same value. */
3120 shutdown = READ_ONCE(sk->sk_shutdown);
3122 /* exceptional events? */
3123 if (READ_ONCE(sk->sk_err) ||
3124 !skb_queue_empty_lockless(&sk->sk_error_queue))
     /* EPOLLPRI is added only when SOCK_SELECT_ERR_QUEUE is set. */
3126 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3128 if (shutdown & RCV_SHUTDOWN)
3129 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3130 if (shutdown == SHUTDOWN_MASK)
     /* Readable when data is queued or when sk_is_readable() says so. */
3134 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3135 mask |= EPOLLIN | EPOLLRDNORM;
3136 if (sk_is_readable(sk))
3137 mask |= EPOLLIN | EPOLLRDNORM;
3139 /* Connection-based need to check for termination and startup */
3140 if (sk->sk_type == SOCK_SEQPACKET) {
3141 if (sk->sk_state == TCP_CLOSE)
3143 /* connection hasn't started yet? */
3144 if (sk->sk_state == TCP_SYN_SENT)
3148 /* No write status requested, avoid expensive OUT tests. */
3149 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
     /* Start from our own send-side view, then consult the peer. */
3152 writable = unix_writable(sk);
     /* The peer pointer is sampled and tested under our state lock. */
3154 unix_state_lock(sk);
3156 other = unix_peer(sk);
     /*
      * Peer exists, is not connected back to us, has a full receive
      * queue, and unix_dgram_peer_wake_me() returned non-zero.
      */
3157 if (other && unix_peer(other) != sk &&
3158 unix_recvq_full_lockless(other) &&
3159 unix_dgram_peer_wake_me(sk, other))
3162 unix_state_unlock(sk);
3166 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
     /* Flag the socket so that a later wakeup signals that space is available. */
3168 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3173 #ifdef CONFIG_PROC_FS
/*
 * A seq_file position packs a hash bucket index and an offset within
 * that bucket into a single value: the low BUCKET_SPACE bits carry the
 * offset, the bits above them carry the bucket.
 */
3175 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
     /* Bucket index: the bits above BUCKET_SPACE. */
3177 #define get_bucket(x) ((x) >> BUCKET_SPACE)
     /* Offset within the bucket: the low BUCKET_SPACE bits. */
3178 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
     /* Combine bucket b and offset o; o is expected to fit in BUCKET_SPACE bits. */
3179 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
/*
 * unix_from_bucket() - look up the socket addressed by *pos.
 * @seq: seq_file, used to reach the per-netns hash table
 * @pos: position holding the bucket index and the offset in that bucket
 *
 * Walks the chain of the addressed bucket and counts entries until the
 * count equals the offset.  Offsets are 1-based: the counter is
 * incremented before the comparison, so the first entry matches 1.
 */
3181 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3183 unsigned long offset = get_offset(*pos);
3184 unsigned long bucket = get_bucket(*pos);
3185 unsigned long count = 0;
3188 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3189 sk; sk = sk_next(sk)) {
3190 if (++count == offset)
/*
 * unix_get_first() - starting at the bucket encoded in *pos, scan the
 * buckets until unix_from_bucket() yields a socket.
 * @seq: seq_file, used to reach the per-netns hash table
 * @pos: position, advanced to offset 1 of the next bucket each time a
 *       bucket is given up
 *
 * Every bucket is searched under its own spinlock.
 * NOTE(review): unix_get_next() and unix_seq_stop() unlock the bucket of
 * the socket they were given, so a hit presumably returns with the
 * bucket lock still held - confirm.
 */
3197 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3199 unsigned long bucket = get_bucket(*pos);
3200 struct net *net = seq_file_net(seq);
3203 while (bucket < UNIX_HASH_SIZE) {
3204 spin_lock(&net->unx.table.locks[bucket]);
3206 sk = unix_from_bucket(seq, pos);
     /* Nothing usable in this bucket: release it and try the next one. */
3210 spin_unlock(&net->unx.table.locks[bucket]);
3212 *pos = set_bucket_offset(++bucket, 1);
/*
 * unix_get_next() - continue the iteration after @sk.
 *
 * The statements shown here cover leaving the current bucket: its
 * spinlock is released, *pos is moved to offset 1 of the following
 * bucket and the scan resumes through unix_get_first().
 */
3218 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3221 unsigned long bucket = get_bucket(*pos);
3228 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3230 *pos = set_bucket_offset(++bucket, 1);
3232 return unix_get_first(seq, pos);
/*
 * unix_seq_start() - seq_file start callback.
 * Returns either SEQ_START_TOKEN or the socket found from *pos by
 * unix_get_first().
 */
3235 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3238 return SEQ_START_TOKEN;
3240 return unix_get_first(seq, pos);
/*
 * unix_seq_next() - seq_file next callback.
 * After the start token the iteration begins with unix_get_first();
 * after a socket it continues with unix_get_next().
 */
3243 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3247 if (v == SEQ_START_TOKEN)
3248 return unix_get_first(seq, pos);
3250 return unix_get_next(seq, v, pos);
/*
 * unix_seq_stop() - seq_file stop callback.
 * Releases the spinlock of the hash bucket selected by sk->sk_hash of
 * the socket the iteration stopped on.
 */
3253 static void unix_seq_stop(struct seq_file *seq, void *v)
3255 struct sock *sk = v;
3258 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
/*
 * unix_seq_show() - seq_file show callback.
 * Prints the column header for the start token, otherwise one line
 * describing a socket, followed by its bound name when it has one.
 */
3261 static int unix_seq_show(struct seq_file *seq, void *v)
3264 if (v == SEQ_START_TOKEN)
3265 seq_puts(seq, "Num RefCount Protocol Flags Type St "
3269 struct unix_sock *u = unix_sk(s);
     /* One fixed-format record per socket. */
3272 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3274 refcount_read(&s->sk_refcnt),
     /* Listening sockets are flagged with __SO_ACCEPTCON. */
3276 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
     /* sk_state is translated to an SS_* socket state for display. */
3279 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3280 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3283 if (u->addr) { // under a hash table lock here
     /* Length of the sun_path part of the stored address. */
3288 len = u->addr->len -
3289 offsetof(struct sockaddr_un, sun_path);
     /*
      * A non-zero first byte is handled separately from a leading zero
      * byte; presumably pathname versus abstract names - TODO confirm.
      */
3290 if (u->addr->name->sun_path[0]) {
     /* Emit the name byte by byte; zero bytes are replaced on output. */
3296 for ( ; i < len; i++)
3297 seq_putc(seq, u->addr->name->sun_path[i] ?:
3300 unix_state_unlock(s);
3301 seq_putc(seq, '\n');
/* seq_file callbacks used for the per-netns proc entry named unix. */
3307 static const struct seq_operations unix_seq_ops = {
3308 .start = unix_seq_start,
3309 .next = unix_seq_next,
3310 .stop = unix_seq_stop,
3311 .show = unix_seq_show,
3314 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
/*
 * Private state of the BPF unix socket iterator: a resizable array of
 * referenced sockets taken from one hash bucket at a time.
 */
3315 struct bpf_unix_iter_state {
3316 struct seq_net_private p;
     /* Index of the batch entry currently being shown. */
3317 unsigned int cur_sk;
     /* One past the last filled batch entry. */
3318 unsigned int end_sk;
     /* Number of slots allocated in batch. */
3319 unsigned int max_sk;
     /* Array of socket pointers, each holding a reference. */
3320 struct sock **batch;
     /* Set once a whole bucket fitted into the batch. */
3321 bool st_bucket_done;
/* Context handed to a BPF program attached to the unix iterator. */
3324 struct bpf_iter__unix {
3325 __bpf_md_ptr(struct bpf_iter_meta *, meta);
3326 __bpf_md_ptr(struct unix_sock *, unix_sk);
     /* uid of the socket, explicitly aligned to 8 bytes. */
3327 uid_t uid __aligned(8);
/*
 * unix_prog_seq_show() - fill a bpf_iter__unix context and run @prog.
 * @prog: BPF program to run
 * @meta: iterator metadata; its seq_num is adjusted before the run
 * @unix_sk: socket to expose to the program
 * @uid: uid to expose to the program
 *
 * Returns the result of bpf_iter_run_prog().
 */
3330 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3331 struct unix_sock *unix_sk, uid_t uid)
3333 struct bpf_iter__unix ctx;
3335 meta->seq_num--; /* skip SEQ_START_TOKEN */
3337 ctx.unix_sk = unix_sk;
3339 return bpf_iter_run_prog(prog, &ctx);
/*
 * bpf_iter_unix_hold_batch() - collect @start_sk and the sockets chained
 * after it into iter->batch, then release the bucket lock.
 * @seq: seq_file whose private data is the iterator state
 * @start_sk: first socket of the batch; a reference is taken on it
 *
 * Sockets are stored only while end_sk is below max_sk.  The counter
 * named expected starts at 1 to account for @start_sk.
 * NOTE(review): bpf_iter_unix_batch() compares end_sk with the value
 * returned from here, so the return value is presumably the number of
 * sockets seen in the bucket - confirm.
 */
3342 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3345 struct bpf_unix_iter_state *iter = seq->private;
3346 unsigned int expected = 1;
3349 sock_hold(start_sk);
3350 iter->batch[iter->end_sk++] = start_sk;
3352 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3353 if (iter->end_sk < iter->max_sk) {
3355 iter->batch[iter->end_sk++] = sk;
     /* The bucket lock indexed by the hash of start_sk is dropped here. */
3361 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
/*
 * bpf_iter_unix_put_batch() - drop the references of every batch entry
 * from cur_sk up to end_sk, advancing cur_sk as it goes.
 */
3366 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3368 while (iter->cur_sk < iter->end_sk)
3369 sock_put(iter->batch[iter->cur_sk++]);
/*
 * bpf_iter_unix_realloc_batch() - replace the batch array with a new one
 * of @new_batch_sz slots.
 * @iter: iterator state
 * @new_batch_sz: number of socket pointers the new array can hold
 *
 * References still held by the old batch are released and the old
 * array is freed before the new one is installed.
 */
3372 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3373 unsigned int new_batch_sz)
3375 struct sock **new_batch;
     /*
      * NOTE(review): the size is an open-coded multiplication; another
      * allocation in this file uses kvmalloc_array() - consider the same.
      */
3377 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3378 GFP_USER | __GFP_NOWARN);
3382 bpf_iter_unix_put_batch(iter);
3383 kvfree(iter->batch);
3384 iter->batch = new_batch;
3385 iter->max_sk = new_batch_sz;
/*
 * bpf_iter_unix_batch() - load the next batch of sockets.
 * @seq: seq_file whose private data is the iterator state
 *
 * When the previous bucket was fully consumed, *pos is first moved to
 * offset 1 of the following bucket.  The batch is then filled through
 * unix_get_first() and bpf_iter_unix_hold_batch().  If the bucket did
 * not fit, the array is grown to one and a half times the expected
 * count, at most once per call.
 */
3390 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3393 struct bpf_unix_iter_state *iter = seq->private;
3394 unsigned int expected;
3395 bool resized = false;
3398 if (iter->st_bucket_done)
3399 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3402 /* Get a new batch */
3406 sk = unix_get_first(seq, pos);
3408 return NULL; /* Done */
3410 expected = bpf_iter_unix_hold_batch(seq, sk);
     /* Everything in the bucket was captured. */
3412 if (iter->end_sk == expected) {
3413 iter->st_bucket_done = true;
     /* Grow the array only if no resize happened yet and allocation succeeds. */
3417 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
/*
 * bpf_iter_unix_seq_start() - start callback of the BPF iterator.
 * Returns either SEQ_START_TOKEN or the first socket of a batch loaded
 * by bpf_iter_unix_batch().
 */
3425 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3428 return SEQ_START_TOKEN;
3430 /* bpf iter does not support lseek, so it always
3431 * continue from where it was stop()-ped.
3433 return bpf_iter_unix_batch(seq, pos);
/*
 * bpf_iter_unix_seq_next() - next callback of the BPF iterator.
 * Drops the reference of the batch entry that was just shown, then
 * returns the following batch entry or, when the batch is exhausted,
 * loads a new batch through bpf_iter_unix_batch().
 */
3436 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3438 struct bpf_unix_iter_state *iter = seq->private;
3441 /* Whenever seq_next() is called, the iter->cur_sk is
3442 * done with seq_show(), so advance to the next sk in
3445 if (iter->cur_sk < iter->end_sk)
3446 sock_put(iter->batch[iter->cur_sk++]);
3450 if (iter->cur_sk < iter->end_sk)
3451 sk = iter->batch[iter->cur_sk];
3453 sk = bpf_iter_unix_batch(seq, pos);
/*
 * bpf_iter_unix_seq_show() - show callback of the BPF iterator.
 * Runs the attached program on socket @v while the socket lock is held
 * through lock_sock_fast().
 */
3458 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3460 struct bpf_iter_meta meta;
3461 struct bpf_prog *prog;
3462 struct sock *sk = v;
3467 if (v == SEQ_START_TOKEN)
3470 slow = lock_sock_fast(sk);
     /* The socket may have left the hash table since it was batched. */
3472 if (unlikely(sk_unhashed(sk))) {
     /* The uid is mapped into the user namespace of the seq_file. */
3477 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3479 prog = bpf_iter_get_info(&meta, false);
3480 ret = unix_prog_seq_show(prog, &meta, v, uid);
3482 unlock_sock_fast(sk, slow);
/*
 * bpf_iter_unix_seq_stop() - stop callback of the BPF iterator.
 * In the lines shown, the program is fetched with the second argument
 * of bpf_iter_get_info() set to true and run once with uid 0, its
 * result being ignored; afterwards any references still held by the
 * batch are released.
 */
3486 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3488 struct bpf_unix_iter_state *iter = seq->private;
3489 struct bpf_iter_meta meta;
3490 struct bpf_prog *prog;
3494 prog = bpf_iter_get_info(&meta, true);
3496 (void)unix_prog_seq_show(prog, &meta, v, 0);
     /* Release whatever part of the batch was not consumed. */
3499 if (iter->cur_sk < iter->end_sk)
3500 bpf_iter_unix_put_batch(iter);
/* seq_file callbacks of the BPF unix socket iterator. */
3503 static const struct seq_operations bpf_iter_unix_seq_ops = {
3504 .start = bpf_iter_unix_seq_start,
3505 .next = bpf_iter_unix_seq_next,
3506 .stop = bpf_iter_unix_seq_stop,
3507 .show = bpf_iter_unix_seq_show,
/* Protocol family descriptor: sockets are created through unix_create(). */
3512 static const struct net_proto_family unix_family_ops = {
3514 .create = unix_create,
3515 .owner = THIS_MODULE,
/*
 * unix_net_init() - per network namespace setup.
 * @net: namespace being initialised
 *
 * Sets the default datagram queue length, registers the sysctl table,
 * creates the proc entry (CONFIG_PROC_FS) and allocates and initialises
 * the UNIX_HASH_SIZE bucket locks and bucket heads.  The trailing
 * statements undo those steps in reverse order.
 */
3519 static int __net_init unix_net_init(struct net *net)
3523 net->unx.sysctl_max_dgram_qlen = 10;
3524 if (unix_sysctl_register(net))
3527 #ifdef CONFIG_PROC_FS
3528 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3529 sizeof(struct seq_net_private)))
     /* One spinlock per hash bucket. */
3533 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3534 sizeof(spinlock_t), GFP_KERNEL);
3535 if (!net->unx.table.locks)
     /* One list head per hash bucket. */
3538 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3539 sizeof(struct hlist_head),
3541 if (!net->unx.table.buckets)
3544 for (i = 0; i < UNIX_HASH_SIZE; i++) {
3545 spin_lock_init(&net->unx.table.locks[i]);
3546 INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
     /* Cleanup statements: lock array first, then proc entry, then sysctl. */
3552 kvfree(net->unx.table.locks);
3554 #ifdef CONFIG_PROC_FS
3555 remove_proc_entry("unix", net->proc_net);
3558 unix_sysctl_unregister(net);
/*
 * unix_net_exit() - per network namespace teardown.
 * Frees the bucket and lock arrays, unregisters the sysctl table and
 * removes the proc entry.
 */
3563 static void __net_exit unix_net_exit(struct net *net)
3565 kvfree(net->unx.table.buckets);
3566 kvfree(net->unx.table.locks);
3567 unix_sysctl_unregister(net);
3568 remove_proc_entry("unix", net->proc_net);
/* Per network namespace init and exit hooks. */
3571 static struct pernet_operations unix_net_ops = {
3572 .init = unix_net_init,
3573 .exit = unix_net_exit,
/*
 * BPF iterator target for unix sockets.  Compiled only when CONFIG_UNIX
 * is built in and both CONFIG_BPF_SYSCALL and CONFIG_PROC_FS are set.
 */
3576 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3577 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3578 struct unix_sock *unix_sk, uid_t uid)
     /* Number of batch slots allocated when an iterator is initialised. */
3580 #define INIT_BATCH_SZ 16
/*
 * bpf_iter_init_unix() - set up the private state of a new iterator.
 * @priv_data: storage for struct bpf_unix_iter_state
 * @aux: auxiliary info passed on to bpf_iter_init_seq_net()
 *
 * Initialises the seq_net part, then allocates the initial batch of
 * INIT_BATCH_SZ slots; the seq_net part is torn down again on the
 * failure path shown.
 */
3582 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3584 struct bpf_unix_iter_state *iter = priv_data;
3587 err = bpf_iter_init_seq_net(priv_data, aux);
3591 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3593 bpf_iter_fini_seq_net(priv_data);
/*
 * bpf_iter_fini_unix() - release the private state of an iterator:
 * the seq_net part first, then the batch array.
 */
3600 static void bpf_iter_fini_unix(void *priv_data)
3602 struct bpf_unix_iter_state *iter = priv_data;
3604 bpf_iter_fini_seq_net(priv_data);
3605 kvfree(iter->batch);
/*
 * seq_file description of the BPF unix iterator: callbacks, private
 * state constructor and destructor, and private state size.
 */
3608 static const struct bpf_iter_seq_info unix_seq_info = {
3609 .seq_ops = &bpf_iter_unix_seq_ops,
3610 .init_seq_private = bpf_iter_init_unix,
3611 .fini_seq_private = bpf_iter_fini_unix,
3612 .seq_priv_size = sizeof(struct bpf_unix_iter_state),
/*
 * bpf_iter_unix_get_func_proto() - helper lookup for programs attached
 * to the unix iterator.
 * @func_id: helper being requested
 * @prog: requesting program
 *
 * The cases shown map the setsockopt and getsockopt helper ids to the
 * bpf_sk_setsockopt_proto and bpf_sk_getsockopt_proto prototypes.
 */
3615 static const struct bpf_func_proto *
3616 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3617 const struct bpf_prog *prog)
3620 case BPF_FUNC_setsockopt:
3621 return &bpf_sk_setsockopt_proto;
3622 case BPF_FUNC_getsockopt:
3623 return &bpf_sk_getsockopt_proto;
/*
 * Registration record of the BPF unix iterator target.  It describes a
 * single context argument, the unix_sk member of struct bpf_iter__unix,
 * typed as PTR_TO_BTF_ID_OR_NULL.
 */
3629 static struct bpf_iter_reg unix_reg_info = {
3631 .ctx_arg_info_size = 1,
3633 { offsetof(struct bpf_iter__unix, unix_sk),
3634 PTR_TO_BTF_ID_OR_NULL },
3636 .get_func_proto = bpf_iter_unix_get_func_proto,
3637 .seq_info = &unix_seq_info,
/*
 * bpf_iter_register() - register the unix iterator target.
 * The BTF id of the context argument is filled in at run time from
 * btf_sock_ids[].  A registration failure is only reported with a
 * warning.
 */
3640 static void __init bpf_iter_register(void)
3642 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3643 if (bpf_iter_reg_target(&unix_reg_info))
3644 pr_warn("Warning: could not register bpf iterator unix\n");
/*
 * af_unix_init() - initialise the AF_UNIX protocol family.
 *
 * Initialises the bsd_socket lock and bucket arrays, registers the
 * datagram and stream protos, the socket family and the per-netns
 * operations, builds the BPF proto and, when configured, registers the
 * BPF iterator.
 */
3648 static int __init af_unix_init(void)
     /* The per-skb private data must fit into the skb control buffer. */
3652 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3654 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3655 spin_lock_init(&bsd_socket_locks[i]);
3656 INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3659 rc = proto_register(&unix_dgram_proto, 1);
3661 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3665 rc = proto_register(&unix_stream_proto, 1);
3667 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
     /* Undo the datagram proto registration when the stream one fails. */
3668 proto_unregister(&unix_dgram_proto);
     /*
      * NOTE(review): the results of sock_register() and
      * register_pernet_subsys() are discarded here - confirm that
      * failures of these calls can safely be ignored.
      */
3672 sock_register(&unix_family_ops);
3673 register_pernet_subsys(&unix_net_ops);
3674 unix_bpf_build_proto();
3676 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3677 bpf_iter_register();
/*
 * af_unix_exit() - module exit: unregister the socket family, both
 * protos and the per-netns operations.
 */
3684 static void __exit af_unix_exit(void)
3686 sock_unregister(PF_UNIX);
3687 proto_unregister(&unix_dgram_proto);
3688 proto_unregister(&unix_stream_proto);
3689 unregister_pernet_subsys(&unix_net_ops);
3692 /* Earlier than device_initcall() so that other drivers invoking
3693 request_module() don't end up in a loop when modprobe tries
3694 to use a UNIX socket. But later than subsys_initcall() because
3695 we depend on stuff initialised there */
/* Init runs at fs_initcall level; exit hook, licence and protocol alias follow. */
3696 fs_initcall(af_unix_init);
3697 module_exit(af_unix_exit);
3699 MODULE_LICENSE("GPL");
3700 MODULE_ALIAS_NETPROTO(PF_UNIX);