net/xdp/xsk.c
1 // SPDX-License-Identifier: GPL-2.0
2 /* XDP sockets
3  *
4  * AF_XDP sockets allow a channel between XDP programs and userspace
5  * applications.
6  * Copyright(c) 2018 Intel Corporation.
7  *
8  * Author(s): Björn Töpel <bjorn.topel@intel.com>
9  *            Magnus Karlsson <magnus.karlsson@intel.com>
10  */
11
12 #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
13
14 #include <linux/if_xdp.h>
15 #include <linux/init.h>
16 #include <linux/sched/mm.h>
17 #include <linux/sched/signal.h>
18 #include <linux/sched/task.h>
19 #include <linux/socket.h>
20 #include <linux/file.h>
21 #include <linux/uaccess.h>
22 #include <linux/net.h>
23 #include <linux/netdevice.h>
24 #include <linux/rculist.h>
25 #include <net/xdp_sock_drv.h>
26 #include <net/busy_poll.h>
27 #include <net/xdp.h>
28
29 #include "xsk_queue.h"
30 #include "xdp_umem.h"
31 #include "xsk.h"
32
33 #define TX_BATCH_SIZE 32
34
35 static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
36
37 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
38 {
39         if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
40                 return;
41
42         pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
43         pool->cached_need_wakeup |= XDP_WAKEUP_RX;
44 }
45 EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
46
47 void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
48 {
49         struct xdp_sock *xs;
50
51         if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
52                 return;
53
54         rcu_read_lock();
55         list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
56                 xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
57         }
58         rcu_read_unlock();
59
60         pool->cached_need_wakeup |= XDP_WAKEUP_TX;
61 }
62 EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
63
64 void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
65 {
66         if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
67                 return;
68
69         pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
70         pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
71 }
72 EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
73
74 void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
75 {
76         struct xdp_sock *xs;
77
78         if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
79                 return;
80
81         rcu_read_lock();
82         list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
83                 xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
84         }
85         rcu_read_unlock();
86
87         pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
88 }
89 EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
90
91 bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
92 {
93         return pool->uses_need_wakeup;
94 }
95 EXPORT_SYMBOL(xsk_uses_need_wakeup);
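
/* Example (editor's sketch, not part of this file): the need_wakeup flags set
 * above are meant to be checked by user space before it pays for a wakeup
 * syscall.  xsk_ring_prod__needs_wakeup() and xsk_socket__fd() are the
 * libbpf/libxdp helper names and are assumptions here; testing the mmapped
 * ring's flags field against XDP_RING_NEED_WAKEUP directly is equivalent.
 *
 *	// Tx path: only kick the kernel when the driver asked for it.
 *	if (xsk_ring_prod__needs_wakeup(&xsk->tx))
 *		sendto(xsk_socket__fd(xsk), NULL, 0, MSG_DONTWAIT, NULL, 0);
 *
 *	// Rx path: a wakeup on the fill ring goes through poll()/recvmsg(),
 *	// which ends up in xsk_wakeup(xs, XDP_WAKEUP_RX) in this file.
 *	if (xsk_ring_prod__needs_wakeup(&xsk->fill))
 *		recvfrom(xsk_socket__fd(xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL);
 */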
96
97 struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
98                                             u16 queue_id)
99 {
100         if (queue_id < dev->real_num_rx_queues)
101                 return dev->_rx[queue_id].pool;
102         if (queue_id < dev->real_num_tx_queues)
103                 return dev->_tx[queue_id].pool;
104
105         return NULL;
106 }
107 EXPORT_SYMBOL(xsk_get_pool_from_qid);
108
109 void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
110 {
111         if (queue_id < dev->num_rx_queues)
112                 dev->_rx[queue_id].pool = NULL;
113         if (queue_id < dev->num_tx_queues)
114                 dev->_tx[queue_id].pool = NULL;
115 }
116
117 /* The buffer pool is stored both in the _rx struct and the _tx struct as we do
118  * not know if the device has more tx queues than rx, or the opposite.
119  * This might also change during run time.
120  */
121 int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
122                         u16 queue_id)
123 {
124         if (queue_id >= max_t(unsigned int,
125                               dev->real_num_rx_queues,
126                               dev->real_num_tx_queues))
127                 return -EINVAL;
128
129         if (queue_id < dev->real_num_rx_queues)
130                 dev->_rx[queue_id].pool = pool;
131         if (queue_id < dev->real_num_tx_queues)
132                 dev->_tx[queue_id].pool = pool;
133
134         return 0;
135 }
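
/* Example (editor's sketch): the core registers the pool here at bind time; a
 * zero-copy capable driver then consults the mapping when it (re)configures a
 * queue to decide whether the ring should run in XSK mode.  The foo_* names
 * and fields below are hypothetical; only xsk_get_pool_from_qid() comes from
 * this file.
 *
 *	static int foo_setup_rxq(struct foo_ring *ring, u16 qid)
 *	{
 *		ring->xsk_pool = xsk_get_pool_from_qid(ring->netdev, qid);
 *		if (ring->xsk_pool)
 *			return foo_setup_rxq_zc(ring);	// zero-copy path
 *		return foo_setup_rxq_copy(ring);	// regular path
 *	}
 */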
136
137 void xp_release(struct xdp_buff_xsk *xskb)
138 {
139         xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb;
140 }
141
142 static u64 xp_get_handle(struct xdp_buff_xsk *xskb)
143 {
144         u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start;
145
146         offset += xskb->pool->headroom;
147         if (!xskb->pool->unaligned)
148                 return xskb->orig_addr + offset;
149         return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
150 }
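
/* Example (editor's sketch): in unaligned chunk mode the handle built above
 * packs the base address in the low 48 bits and the offset in the upper bits
 * (XSK_UNALIGNED_BUF_OFFSET_SHIFT and XSK_UNALIGNED_BUF_ADDR_MASK come from
 * <linux/if_xdp.h>).  A consumer of the Rx ring decodes it roughly like this;
 * umem_area is the application's registered memory and is an assumption here:
 *
 *	__u64 base   = addr & XSK_UNALIGNED_BUF_ADDR_MASK;
 *	__u64 offset = addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;
 *	void *pkt    = (char *)umem_area + base + offset;
 *
 * In aligned mode the offset is simply added in, so addr can be used as-is.
 */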
151
152 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
153 {
154         struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
155         u64 addr;
156         int err;
157
158         addr = xp_get_handle(xskb);
159         err = xskq_prod_reserve_desc(xs->rx, addr, len);
160         if (err) {
161                 xs->rx_queue_full++;
162                 return err;
163         }
164
165         xp_release(xskb);
166         return 0;
167 }
168
169 static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
170 {
171         void *from_buf, *to_buf;
172         u32 metalen;
173
174         if (unlikely(xdp_data_meta_unsupported(from))) {
175                 from_buf = from->data;
176                 to_buf = to->data;
177                 metalen = 0;
178         } else {
179                 from_buf = from->data_meta;
180                 metalen = from->data - from->data_meta;
181                 to_buf = to->data - metalen;
182         }
183
184         memcpy(to_buf, from_buf, len + metalen);
185 }
186
187 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
188 {
189         struct xdp_buff *xsk_xdp;
190         int err;
191         u32 len;
192
193         len = xdp->data_end - xdp->data;
194         if (len > xsk_pool_get_rx_frame_size(xs->pool)) {
195                 xs->rx_dropped++;
196                 return -ENOSPC;
197         }
198
199         xsk_xdp = xsk_buff_alloc(xs->pool);
200         if (!xsk_xdp) {
201                 xs->rx_dropped++;
202                 return -ENOSPC;
203         }
204
205         xsk_copy_xdp(xsk_xdp, xdp, len);
206         err = __xsk_rcv_zc(xs, xsk_xdp, len);
207         if (err) {
208                 xsk_buff_free(xsk_xdp);
209                 return err;
210         }
211         return 0;
212 }
213
214 static bool xsk_tx_writeable(struct xdp_sock *xs)
215 {
216         if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
217                 return false;
218
219         return true;
220 }
221
222 static bool xsk_is_bound(struct xdp_sock *xs)
223 {
224         if (READ_ONCE(xs->state) == XSK_BOUND) {
225                 /* Matches smp_wmb() in bind(). */
226                 smp_rmb();
227                 return true;
228         }
229         return false;
230 }
231
232 static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp)
233 {
234         if (!xsk_is_bound(xs))
235                 return -EINVAL;
236
237         if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
238                 return -EINVAL;
239
240         sk_mark_napi_id_once_xdp(&xs->sk, xdp);
241         return 0;
242 }
243
244 static void xsk_flush(struct xdp_sock *xs)
245 {
246         xskq_prod_submit(xs->rx);
247         __xskq_cons_release(xs->pool->fq);
248         sock_def_readable(&xs->sk);
249 }
250
251 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
252 {
253         int err;
254
255         spin_lock_bh(&xs->rx_lock);
256         err = xsk_rcv_check(xs, xdp);
257         if (!err) {
258                 err = __xsk_rcv(xs, xdp);
259                 xsk_flush(xs);
260         }
261         spin_unlock_bh(&xs->rx_lock);
262         return err;
263 }
264
265 static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
266 {
267         int err;
268         u32 len;
269
270         err = xsk_rcv_check(xs, xdp);
271         if (err)
272                 return err;
273
274         if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
275                 len = xdp->data_end - xdp->data;
276                 return __xsk_rcv_zc(xs, xdp, len);
277         }
278
279         err = __xsk_rcv(xs, xdp);
280         if (!err)
281                 xdp_return_buff(xdp);
282         return err;
283 }
284
285 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
286 {
287         struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
288         int err;
289
290         err = xsk_rcv(xs, xdp);
291         if (err)
292                 return err;
293
294         if (!xs->flush_node.prev)
295                 list_add(&xs->flush_node, flush_list);
296
297         return 0;
298 }
299
300 void __xsk_map_flush(void)
301 {
302         struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
303         struct xdp_sock *xs, *tmp;
304
305         list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
306                 xsk_flush(xs);
307                 __list_del_clearprev(&xs->flush_node);
308         }
309 }
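
/* Example (editor's sketch): __xsk_map_redirect()/__xsk_map_flush() are the
 * kernel half of an XDP program doing bpf_redirect_map() into a
 * BPF_MAP_TYPE_XSKMAP.  A minimal program using the libbpf BTF map convention
 * (<bpf/bpf_helpers.h>) could look like this; the map size is an arbitrary
 * choice:
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_XSKMAP);
 *		__uint(max_entries, 64);
 *		__type(key, __u32);
 *		__type(value, __u32);
 *	} xsks_map SEC(".maps");
 *
 *	SEC("xdp")
 *	int xsk_redirect(struct xdp_md *ctx)
 *	{
 *		// Fall back to XDP_PASS if no socket is bound to this queue.
 *		return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, XDP_PASS);
 *	}
 */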
310
311 void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
312 {
313         xskq_prod_submit_n(pool->cq, nb_entries);
314 }
315 EXPORT_SYMBOL(xsk_tx_completed);
316
317 void xsk_tx_release(struct xsk_buff_pool *pool)
318 {
319         struct xdp_sock *xs;
320
321         rcu_read_lock();
322         list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
323                 __xskq_cons_release(xs->tx);
324                 if (xsk_tx_writeable(xs))
325                         xs->sk.sk_write_space(&xs->sk);
326         }
327         rcu_read_unlock();
328 }
329 EXPORT_SYMBOL(xsk_tx_release);
330
331 bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
332 {
333         struct xdp_sock *xs;
334
335         rcu_read_lock();
336         list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
337                 if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
338                         xs->tx->queue_empty_descs++;
339                         continue;
340                 }
341
342                 /* This is the backpressure mechanism for the Tx path.
343                  * Reserve space in the completion queue and only proceed
344                  * if there is space in it. This avoids having to implement
345                  * any buffering in the Tx path.
346                  */
347                 if (xskq_prod_reserve_addr(pool->cq, desc->addr))
348                         goto out;
349
350                 xskq_cons_release(xs->tx);
351                 rcu_read_unlock();
352                 return true;
353         }
354
355 out:
356         rcu_read_unlock();
357         return false;
358 }
359 EXPORT_SYMBOL(xsk_tx_peek_desc);
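
/* Example (editor's sketch): a zero-copy driver's Tx path consumes descriptors
 * with xsk_tx_peek_desc() and later reports completions with
 * xsk_tx_completed() (both exported here, DMA helpers from xdp_sock_drv.h).
 * foo_post_tx() and the foo_ring layout are hypothetical:
 *
 *	static void foo_xsk_xmit(struct foo_ring *ring, int budget)
 *	{
 *		struct xsk_buff_pool *pool = ring->xsk_pool;
 *		struct xdp_desc desc;
 *
 *		while (budget-- && xsk_tx_peek_desc(pool, &desc)) {
 *			dma_addr_t dma = xsk_buff_raw_get_dma(pool, desc.addr);
 *
 *			xsk_buff_raw_dma_sync_for_device(pool, dma, desc.len);
 *			foo_post_tx(ring, dma, desc.len);
 *		}
 *		xsk_tx_release(pool);	// publish consumed Tx entries
 *	}
 *
 *	// In the Tx completion handler, once nb_done buffers are sent:
 *	//	xsk_tx_completed(pool, nb_done);
 */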
360
361 static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, struct xdp_desc *descs,
362                                         u32 max_entries)
363 {
364         u32 nb_pkts = 0;
365
366         while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
367                 nb_pkts++;
368
369         xsk_tx_release(pool);
370         return nb_pkts;
371 }
372
373 u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *descs,
374                                    u32 max_entries)
375 {
376         struct xdp_sock *xs;
377         u32 nb_pkts;
378
379         rcu_read_lock();
380         if (!list_is_singular(&pool->xsk_tx_list)) {
381                 /* Fallback to the non-batched version */
382                 rcu_read_unlock();
383                 return xsk_tx_peek_release_fallback(pool, descs, max_entries);
384         }
385
386         xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
387         if (!xs) {
388                 nb_pkts = 0;
389                 goto out;
390         }
391
392         nb_pkts = xskq_cons_peek_desc_batch(xs->tx, descs, pool, max_entries);
393         if (!nb_pkts) {
394                 xs->tx->queue_empty_descs++;
395                 goto out;
396         }
397
398         /* This is the backpressure mechanism for the Tx path. Try to
399          * reserve space in the completion queue for all packets, but
400          * if there are fewer slots available, just process that many
401          * packets. This avoids having to implement any buffering in
402          * the Tx path.
403          */
404         nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, descs, nb_pkts);
405         if (!nb_pkts)
406                 goto out;
407
408         xskq_cons_release_n(xs->tx, nb_pkts);
409         __xskq_cons_release(xs->tx);
410         xs->sk.sk_write_space(&xs->sk);
411
412 out:
413         rcu_read_unlock();
414         return nb_pkts;
415 }
416 EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);
417
418 static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
419 {
420         struct net_device *dev = xs->dev;
421         int err;
422
423         rcu_read_lock();
424         err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
425         rcu_read_unlock();
426
427         return err;
428 }
429
430 static int xsk_zc_xmit(struct xdp_sock *xs)
431 {
432         return xsk_wakeup(xs, XDP_WAKEUP_TX);
433 }
434
435 static void xsk_destruct_skb(struct sk_buff *skb)
436 {
437         u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
438         struct xdp_sock *xs = xdp_sk(skb->sk);
439         unsigned long flags;
440
441         spin_lock_irqsave(&xs->pool->cq_lock, flags);
442         xskq_prod_submit_addr(xs->pool->cq, addr);
443         spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
444
445         sock_wfree(skb);
446 }
447
448 static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
449                                               struct xdp_desc *desc)
450 {
451         struct xsk_buff_pool *pool = xs->pool;
452         u32 hr, len, ts, offset, copy, copied;
453         struct sk_buff *skb;
454         struct page *page;
455         void *buffer;
456         int err, i;
457         u64 addr;
458
459         hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
460
461         skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
462         if (unlikely(!skb))
463                 return ERR_PTR(err);
464
465         skb_reserve(skb, hr);
466
467         addr = desc->addr;
468         len = desc->len;
469         ts = pool->unaligned ? len : pool->chunk_size;
470
471         buffer = xsk_buff_raw_get_data(pool, addr);
472         offset = offset_in_page(buffer);
473         addr = buffer - pool->addrs;
474
475         for (copied = 0, i = 0; copied < len; i++) {
476                 page = pool->umem->pgs[addr >> PAGE_SHIFT];
477                 get_page(page);
478
479                 copy = min_t(u32, PAGE_SIZE - offset, len - copied);
480                 skb_fill_page_desc(skb, i, page, offset, copy);
481
482                 copied += copy;
483                 addr += copy;
484                 offset = 0;
485         }
486
487         skb->len += len;
488         skb->data_len += len;
489         skb->truesize += ts;
490
491         refcount_add(ts, &xs->sk.sk_wmem_alloc);
492
493         return skb;
494 }
495
496 static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
497                                      struct xdp_desc *desc)
498 {
499         struct net_device *dev = xs->dev;
500         struct sk_buff *skb;
501
502         if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
503                 skb = xsk_build_skb_zerocopy(xs, desc);
504                 if (IS_ERR(skb))
505                         return skb;
506         } else {
507                 u32 hr, tr, len;
508                 void *buffer;
509                 int err;
510
511                 hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
512                 tr = dev->needed_tailroom;
513                 len = desc->len;
514
515                 skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
516                 if (unlikely(!skb))
517                         return ERR_PTR(err);
518
519                 skb_reserve(skb, hr);
520                 skb_put(skb, len);
521
522                 buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
523                 err = skb_store_bits(skb, 0, buffer, len);
524                 if (unlikely(err)) {
525                         kfree_skb(skb);
526                         return ERR_PTR(err);
527                 }
528         }
529
530         skb->dev = dev;
531         skb->priority = xs->sk.sk_priority;
532         skb->mark = xs->sk.sk_mark;
533         skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
534         skb->destructor = xsk_destruct_skb;
535
536         return skb;
537 }
538
539 static int xsk_generic_xmit(struct sock *sk)
540 {
541         struct xdp_sock *xs = xdp_sk(sk);
542         u32 max_batch = TX_BATCH_SIZE;
543         bool sent_frame = false;
544         struct xdp_desc desc;
545         struct sk_buff *skb;
546         unsigned long flags;
547         int err = 0;
548
549         mutex_lock(&xs->mutex);
550
551         if (xs->queue_id >= xs->dev->real_num_tx_queues)
552                 goto out;
553
554         while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
555                 if (max_batch-- == 0) {
556                         err = -EAGAIN;
557                         goto out;
558                 }
559
560                 skb = xsk_build_skb(xs, &desc);
561                 if (IS_ERR(skb)) {
562                         err = PTR_ERR(skb);
563                         goto out;
564                 }
565
566                 /* This is the backpressure mechanism for the Tx path.
567                  * Reserve space in the completion queue and only proceed
568                  * if there is space in it. This avoids having to implement
569                  * any buffering in the Tx path.
570                  */
571                 spin_lock_irqsave(&xs->pool->cq_lock, flags);
572                 if (xskq_prod_reserve(xs->pool->cq)) {
573                         spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
574                         kfree_skb(skb);
575                         goto out;
576                 }
577                 spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
578
579                 err = __dev_direct_xmit(skb, xs->queue_id);
580                 if (err == NETDEV_TX_BUSY) {
581                         /* Tell user-space to retry the send */
582                         skb->destructor = sock_wfree;
583                         spin_lock_irqsave(&xs->pool->cq_lock, flags);
584                         xskq_prod_cancel(xs->pool->cq);
585                         spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
586                         /* Free skb without triggering the perf drop trace */
587                         consume_skb(skb);
588                         err = -EAGAIN;
589                         goto out;
590                 }
591
592                 xskq_cons_release(xs->tx);
593                 /* Ignore NET_XMIT_CN as packet might have been sent */
594                 if (err == NET_XMIT_DROP) {
595                         /* SKB completed but not sent */
596                         err = -EBUSY;
597                         goto out;
598                 }
599
600                 sent_frame = true;
601         }
602
603         xs->tx->queue_empty_descs++;
604
605 out:
606         if (sent_frame)
607                 if (xsk_tx_writeable(xs))
608                         sk->sk_write_space(sk);
609
610         mutex_unlock(&xs->mutex);
611         return err;
612 }
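
/* Example (editor's sketch): xsk_generic_xmit() above runs when user space
 * issues a send syscall on a copy-mode socket.  A typical application loop,
 * using the libbpf/libxdp ring helpers (assumed names), fills the Tx ring and
 * then kicks the kernel; EAGAIN/EBUSY from above surface as errno and just
 * mean "retry later":
 *
 *	__u32 idx;
 *
 *	if (xsk_ring_prod__reserve(&xsk->tx, 1, &idx) == 1) {
 *		struct xdp_desc *d = xsk_ring_prod__tx_desc(&xsk->tx, idx);
 *
 *		d->addr = frame_addr;	// offset into the registered UMEM
 *		d->len  = frame_len;
 *		xsk_ring_prod__submit(&xsk->tx, 1);
 *	}
 *	if (sendto(xsk_socket__fd(xsk), NULL, 0, MSG_DONTWAIT, NULL, 0) < 0 &&
 *	    errno != EAGAIN && errno != EBUSY && errno != ENOBUFS)
 *		perror("sendto");
 */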
613
614 static int __xsk_sendmsg(struct sock *sk)
615 {
616         struct xdp_sock *xs = xdp_sk(sk);
617
618         if (unlikely(!(xs->dev->flags & IFF_UP)))
619                 return -ENETDOWN;
620         if (unlikely(!xs->tx))
621                 return -ENOBUFS;
622
623         return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
624 }
625
626 static bool xsk_no_wakeup(struct sock *sk)
627 {
628 #ifdef CONFIG_NET_RX_BUSY_POLL
629         /* Prefer busy-polling, skip the wakeup. */
630         return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
631                 READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID;
632 #else
633         return false;
634 #endif
635 }
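
/* Example (editor's sketch): the busy-poll preference checked above is
 * configured from user space with standard socket options
 * (SO_PREFER_BUSY_POLL and SO_BUSY_POLL_BUDGET were added alongside this
 * code); the values below are arbitrary:
 *
 *	int prefer = 1, usec = 20, budget = 64;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PREFER_BUSY_POLL, &prefer, sizeof(prefer));
 *	setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &usec, sizeof(usec));
 *	setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL_BUDGET, &budget, sizeof(budget));
 */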
636
637 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
638 {
639         bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
640         struct sock *sk = sock->sk;
641         struct xdp_sock *xs = xdp_sk(sk);
642         struct xsk_buff_pool *pool;
643
644         if (unlikely(!xsk_is_bound(xs)))
645                 return -ENXIO;
646         if (unlikely(need_wait))
647                 return -EOPNOTSUPP;
648
649         if (sk_can_busy_loop(sk))
650                 sk_busy_loop(sk, 1); /* only support non-blocking sockets */
651
652         if (xsk_no_wakeup(sk))
653                 return 0;
654
655         pool = xs->pool;
656         if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
657                 return __xsk_sendmsg(sk);
658         return 0;
659 }
660
661 static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
662 {
663         bool need_wait = !(flags & MSG_DONTWAIT);
664         struct sock *sk = sock->sk;
665         struct xdp_sock *xs = xdp_sk(sk);
666
667         if (unlikely(!xsk_is_bound(xs)))
668                 return -ENXIO;
669         if (unlikely(!(xs->dev->flags & IFF_UP)))
670                 return -ENETDOWN;
671         if (unlikely(!xs->rx))
672                 return -ENOBUFS;
673         if (unlikely(need_wait))
674                 return -EOPNOTSUPP;
675
676         if (sk_can_busy_loop(sk))
677                 sk_busy_loop(sk, 1); /* only support non-blocking sockets */
678
679         if (xsk_no_wakeup(sk))
680                 return 0;
681
682         if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
683                 return xsk_wakeup(xs, XDP_WAKEUP_RX);
684         return 0;
685 }
686
687 static __poll_t xsk_poll(struct file *file, struct socket *sock,
688                              struct poll_table_struct *wait)
689 {
690         __poll_t mask = 0;
691         struct sock *sk = sock->sk;
692         struct xdp_sock *xs = xdp_sk(sk);
693         struct xsk_buff_pool *pool;
694
695         sock_poll_wait(file, sock, wait);
696
697         if (unlikely(!xsk_is_bound(xs)))
698                 return mask;
699
700         pool = xs->pool;
701
702         if (pool->cached_need_wakeup) {
703                 if (xs->zc)
704                         xsk_wakeup(xs, pool->cached_need_wakeup);
705                 else
706                         /* Poll needs to drive Tx also in copy mode */
707                         __xsk_sendmsg(sk);
708         }
709
710         if (xs->rx && !xskq_prod_is_empty(xs->rx))
711                 mask |= EPOLLIN | EPOLLRDNORM;
712         if (xs->tx && xsk_tx_writeable(xs))
713                 mask |= EPOLLOUT | EPOLLWRNORM;
714
715         return mask;
716 }
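
/* Example (editor's sketch): with XDP_USE_NEED_WAKEUP, poll() doubles as the
 * wakeup mechanism for both directions, so an application can block on the
 * socket and let xsk_poll() above drive Rx and copy-mode Tx.  rx_batch() and
 * tx_batch() stand for the application's own ring processing:
 *
 *	struct pollfd pfd = {
 *		.fd     = xsk_socket__fd(xsk),	// libbpf/libxdp helper, assumed
 *		.events = POLLIN | POLLOUT,
 *	};
 *
 *	if (poll(&pfd, 1, -1) > 0) {
 *		if (pfd.revents & POLLIN)
 *			rx_batch(xsk);		// drain the Rx ring
 *		if (pfd.revents & POLLOUT)
 *			tx_batch(xsk);		// refill the Tx ring
 *	}
 */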
717
718 static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
719                           bool umem_queue)
720 {
721         struct xsk_queue *q;
722
723         if (entries == 0 || *queue || !is_power_of_2(entries))
724                 return -EINVAL;
725
726         q = xskq_create(entries, umem_queue);
727         if (!q)
728                 return -ENOMEM;
729
730         /* Make sure queue is ready before it can be seen by others */
731         smp_wmb();
732         WRITE_ONCE(*queue, q);
733         return 0;
734 }
735
736 static void xsk_unbind_dev(struct xdp_sock *xs)
737 {
738         struct net_device *dev = xs->dev;
739
740         if (xs->state != XSK_BOUND)
741                 return;
742         WRITE_ONCE(xs->state, XSK_UNBOUND);
743
744         /* Wait for driver to stop using the xdp socket. */
745         xp_del_xsk(xs->pool, xs);
746         xs->dev = NULL;
747         synchronize_net();
748         dev_put(dev);
749 }
750
751 static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
752                                               struct xdp_sock __rcu ***map_entry)
753 {
754         struct xsk_map *map = NULL;
755         struct xsk_map_node *node;
756
757         *map_entry = NULL;
758
759         spin_lock_bh(&xs->map_list_lock);
760         node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
761                                         node);
762         if (node) {
763                 bpf_map_inc(&node->map->map);
764                 map = node->map;
765                 *map_entry = node->map_entry;
766         }
767         spin_unlock_bh(&xs->map_list_lock);
768         return map;
769 }
770
771 static void xsk_delete_from_maps(struct xdp_sock *xs)
772 {
773         /* This function removes the current XDP socket from all the
774          * maps it resides in. We need to take extra care here, due to
775          * the two locks involved. Each map has a lock synchronizing
776          * updates to the entries, and each socket has a lock that
777          * synchronizes access to the list of maps (map_list). For
778          * deadlock avoidance the locks need to be taken in the order
779          * "map lock"->"socket map list lock". We start off by
780          * accessing the socket map list, and take a reference to the
781          * map to guarantee existence between the
782          * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
783          * calls. Then we ask the map to remove the socket, which
784          * tries to remove the socket from the map. Note that there
785          * might be updates to the map between
786          * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
787          */
788         struct xdp_sock __rcu **map_entry = NULL;
789         struct xsk_map *map;
790
791         while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
792                 xsk_map_try_sock_delete(map, xs, map_entry);
793                 bpf_map_put(&map->map);
794         }
795 }
796
797 static int xsk_release(struct socket *sock)
798 {
799         struct sock *sk = sock->sk;
800         struct xdp_sock *xs = xdp_sk(sk);
801         struct net *net;
802
803         if (!sk)
804                 return 0;
805
806         net = sock_net(sk);
807
808         mutex_lock(&net->xdp.lock);
809         sk_del_node_init_rcu(sk);
810         mutex_unlock(&net->xdp.lock);
811
812         local_bh_disable();
813         sock_prot_inuse_add(net, sk->sk_prot, -1);
814         local_bh_enable();
815
816         xsk_delete_from_maps(xs);
817         mutex_lock(&xs->mutex);
818         xsk_unbind_dev(xs);
819         mutex_unlock(&xs->mutex);
820
821         xskq_destroy(xs->rx);
822         xskq_destroy(xs->tx);
823         xskq_destroy(xs->fq_tmp);
824         xskq_destroy(xs->cq_tmp);
825
826         sock_orphan(sk);
827         sock->sk = NULL;
828
829         sk_refcnt_debug_release(sk);
830         sock_put(sk);
831
832         return 0;
833 }
834
835 static struct socket *xsk_lookup_xsk_from_fd(int fd)
836 {
837         struct socket *sock;
838         int err;
839
840         sock = sockfd_lookup(fd, &err);
841         if (!sock)
842                 return ERR_PTR(-ENOTSOCK);
843
844         if (sock->sk->sk_family != PF_XDP) {
845                 sockfd_put(sock);
846                 return ERR_PTR(-ENOPROTOOPT);
847         }
848
849         return sock;
850 }
851
852 static bool xsk_validate_queues(struct xdp_sock *xs)
853 {
854         return xs->fq_tmp && xs->cq_tmp;
855 }
856
857 static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
858 {
859         struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
860         struct sock *sk = sock->sk;
861         struct xdp_sock *xs = xdp_sk(sk);
862         struct net_device *dev;
863         u32 flags, qid;
864         int err = 0;
865
866         if (addr_len < sizeof(struct sockaddr_xdp))
867                 return -EINVAL;
868         if (sxdp->sxdp_family != AF_XDP)
869                 return -EINVAL;
870
871         flags = sxdp->sxdp_flags;
872         if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
873                       XDP_USE_NEED_WAKEUP))
874                 return -EINVAL;
875
876         rtnl_lock();
877         mutex_lock(&xs->mutex);
878         if (xs->state != XSK_READY) {
879                 err = -EBUSY;
880                 goto out_release;
881         }
882
883         dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
884         if (!dev) {
885                 err = -ENODEV;
886                 goto out_release;
887         }
888
889         if (!xs->rx && !xs->tx) {
890                 err = -EINVAL;
891                 goto out_unlock;
892         }
893
894         qid = sxdp->sxdp_queue_id;
895
896         if (flags & XDP_SHARED_UMEM) {
897                 struct xdp_sock *umem_xs;
898                 struct socket *sock;
899
900                 if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
901                     (flags & XDP_USE_NEED_WAKEUP)) {
902                         /* Cannot specify flags for shared sockets. */
903                         err = -EINVAL;
904                         goto out_unlock;
905                 }
906
907                 if (xs->umem) {
908                         /* We already have our own. */
909                         err = -EINVAL;
910                         goto out_unlock;
911                 }
912
913                 sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
914                 if (IS_ERR(sock)) {
915                         err = PTR_ERR(sock);
916                         goto out_unlock;
917                 }
918
919                 umem_xs = xdp_sk(sock->sk);
920                 if (!xsk_is_bound(umem_xs)) {
921                         err = -EBADF;
922                         sockfd_put(sock);
923                         goto out_unlock;
924                 }
925
926                 if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
927                         /* Share the umem with another socket on another qid
928                          * and/or device.
929                          */
930                         xs->pool = xp_create_and_assign_umem(xs,
931                                                              umem_xs->umem);
932                         if (!xs->pool) {
933                                 err = -ENOMEM;
934                                 sockfd_put(sock);
935                                 goto out_unlock;
936                         }
937
938                         err = xp_assign_dev_shared(xs->pool, umem_xs->umem,
939                                                    dev, qid);
940                         if (err) {
941                                 xp_destroy(xs->pool);
942                                 xs->pool = NULL;
943                                 sockfd_put(sock);
944                                 goto out_unlock;
945                         }
946                 } else {
947                         /* Share the buffer pool with the other socket. */
948                         if (xs->fq_tmp || xs->cq_tmp) {
949                                 /* Do not allow setting your own fq or cq. */
950                                 err = -EINVAL;
951                                 sockfd_put(sock);
952                                 goto out_unlock;
953                         }
954
955                         xp_get_pool(umem_xs->pool);
956                         xs->pool = umem_xs->pool;
957                 }
958
959                 xdp_get_umem(umem_xs->umem);
960                 WRITE_ONCE(xs->umem, umem_xs->umem);
961                 sockfd_put(sock);
962         } else if (!xs->umem || !xsk_validate_queues(xs)) {
963                 err = -EINVAL;
964                 goto out_unlock;
965         } else {
966                 /* This xsk has its own umem. */
967                 xs->pool = xp_create_and_assign_umem(xs, xs->umem);
968                 if (!xs->pool) {
969                         err = -ENOMEM;
970                         goto out_unlock;
971                 }
972
973                 err = xp_assign_dev(xs->pool, dev, qid, flags);
974                 if (err) {
975                         xp_destroy(xs->pool);
976                         xs->pool = NULL;
977                         goto out_unlock;
978                 }
979         }
980
981         /* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
982         xs->fq_tmp = NULL;
983         xs->cq_tmp = NULL;
984
985         xs->dev = dev;
986         xs->zc = xs->umem->zc;
987         xs->queue_id = qid;
988         xp_add_xsk(xs->pool, xs);
989
990 out_unlock:
991         if (err) {
992                 dev_put(dev);
993         } else {
994                 /* Matches smp_rmb() in bind() for shared umem
995                  * sockets, and xsk_is_bound().
996                  */
997                 smp_wmb();
998                 WRITE_ONCE(xs->state, XSK_BOUND);
999         }
1000 out_release:
1001         mutex_unlock(&xs->mutex);
1002         rtnl_unlock();
1003         return err;
1004 }
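
/* Example (editor's sketch): the bind() handled above takes a struct
 * sockaddr_xdp from <linux/if_xdp.h>.  The second form shows XDP_SHARED_UMEM,
 * where a new socket reuses the UMEM of an already-bound one; note that no
 * other flags may be combined with it:
 *
 *	struct sockaddr_xdp sxdp = {
 *		.sxdp_family   = AF_XDP,
 *		.sxdp_ifindex  = ifindex,
 *		.sxdp_queue_id = queue_id,
 *		.sxdp_flags    = XDP_USE_NEED_WAKEUP,
 *	};
 *
 *	bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
 *
 *	// Second socket sharing the first one's UMEM:
 *	sxdp.sxdp_flags          = XDP_SHARED_UMEM;
 *	sxdp.sxdp_shared_umem_fd = fd;
 *	bind(second_fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
 */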
1005
1006 struct xdp_umem_reg_v1 {
1007         __u64 addr; /* Start of packet data area */
1008         __u64 len; /* Length of packet data area */
1009         __u32 chunk_size;
1010         __u32 headroom;
1011 };
1012
1013 static int xsk_setsockopt(struct socket *sock, int level, int optname,
1014                           sockptr_t optval, unsigned int optlen)
1015 {
1016         struct sock *sk = sock->sk;
1017         struct xdp_sock *xs = xdp_sk(sk);
1018         int err;
1019
1020         if (level != SOL_XDP)
1021                 return -ENOPROTOOPT;
1022
1023         switch (optname) {
1024         case XDP_RX_RING:
1025         case XDP_TX_RING:
1026         {
1027                 struct xsk_queue **q;
1028                 int entries;
1029
1030                 if (optlen < sizeof(entries))
1031                         return -EINVAL;
1032                 if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1033                         return -EFAULT;
1034
1035                 mutex_lock(&xs->mutex);
1036                 if (xs->state != XSK_READY) {
1037                         mutex_unlock(&xs->mutex);
1038                         return -EBUSY;
1039                 }
1040                 q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
1041                 err = xsk_init_queue(entries, q, false);
1042                 if (!err && optname == XDP_TX_RING)
1043                         /* Tx needs to be explicitly woken up the first time */
1044                         xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
1045                 mutex_unlock(&xs->mutex);
1046                 return err;
1047         }
1048         case XDP_UMEM_REG:
1049         {
1050                 size_t mr_size = sizeof(struct xdp_umem_reg);
1051                 struct xdp_umem_reg mr = {};
1052                 struct xdp_umem *umem;
1053
1054                 if (optlen < sizeof(struct xdp_umem_reg_v1))
1055                         return -EINVAL;
1056                 else if (optlen < sizeof(mr))
1057                         mr_size = sizeof(struct xdp_umem_reg_v1);
1058
1059                 if (copy_from_sockptr(&mr, optval, mr_size))
1060                         return -EFAULT;
1061
1062                 mutex_lock(&xs->mutex);
1063                 if (xs->state != XSK_READY || xs->umem) {
1064                         mutex_unlock(&xs->mutex);
1065                         return -EBUSY;
1066                 }
1067
1068                 umem = xdp_umem_create(&mr);
1069                 if (IS_ERR(umem)) {
1070                         mutex_unlock(&xs->mutex);
1071                         return PTR_ERR(umem);
1072                 }
1073
1074                 /* Make sure umem is ready before it can be seen by others */
1075                 smp_wmb();
1076                 WRITE_ONCE(xs->umem, umem);
1077                 mutex_unlock(&xs->mutex);
1078                 return 0;
1079         }
1080         case XDP_UMEM_FILL_RING:
1081         case XDP_UMEM_COMPLETION_RING:
1082         {
1083                 struct xsk_queue **q;
1084                 int entries;
1085
1086                 if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1087                         return -EFAULT;
1088
1089                 mutex_lock(&xs->mutex);
1090                 if (xs->state != XSK_READY) {
1091                         mutex_unlock(&xs->mutex);
1092                         return -EBUSY;
1093                 }
1094
1095                 q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
1096                         &xs->cq_tmp;
1097                 err = xsk_init_queue(entries, q, true);
1098                 mutex_unlock(&xs->mutex);
1099                 return err;
1100         }
1101         default:
1102                 break;
1103         }
1104
1105         return -ENOPROTOOPT;
1106 }
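
/* Example (editor's sketch): the usual user-space setup order is to register
 * the UMEM and then size the four rings, all through the options handled
 * above.  Ring sizes must be powers of two (enforced by xsk_init_queue());
 * NUM_FRAMES, FRAME_SIZE and the ring size below are arbitrary, and umem_area
 * must be a page-aligned buffer owned by the application:
 *
 *	int fd = socket(AF_XDP, SOCK_RAW, 0);
 *	struct xdp_umem_reg mr = {
 *		.addr       = (__u64)(uintptr_t)umem_area,
 *		.len        = NUM_FRAMES * FRAME_SIZE,
 *		.chunk_size = FRAME_SIZE,
 *		.headroom   = 0,
 *	};
 *	int ring_sz = 2048;
 *
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &ring_sz, sizeof(ring_sz));
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &ring_sz, sizeof(ring_sz));
 *	setsockopt(fd, SOL_XDP, XDP_RX_RING, &ring_sz, sizeof(ring_sz));
 *	setsockopt(fd, SOL_XDP, XDP_TX_RING, &ring_sz, sizeof(ring_sz));
 */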
1107
1108 static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
1109 {
1110         ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
1111         ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
1112         ring->desc = offsetof(struct xdp_rxtx_ring, desc);
1113 }
1114
1115 static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
1116 {
1117         ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
1118         ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
1119         ring->desc = offsetof(struct xdp_umem_ring, desc);
1120 }
1121
1122 struct xdp_statistics_v1 {
1123         __u64 rx_dropped;
1124         __u64 rx_invalid_descs;
1125         __u64 tx_invalid_descs;
1126 };
1127
1128 static int xsk_getsockopt(struct socket *sock, int level, int optname,
1129                           char __user *optval, int __user *optlen)
1130 {
1131         struct sock *sk = sock->sk;
1132         struct xdp_sock *xs = xdp_sk(sk);
1133         int len;
1134
1135         if (level != SOL_XDP)
1136                 return -ENOPROTOOPT;
1137
1138         if (get_user(len, optlen))
1139                 return -EFAULT;
1140         if (len < 0)
1141                 return -EINVAL;
1142
1143         switch (optname) {
1144         case XDP_STATISTICS:
1145         {
1146                 struct xdp_statistics stats = {};
1147                 bool extra_stats = true;
1148                 size_t stats_size;
1149
1150                 if (len < sizeof(struct xdp_statistics_v1)) {
1151                         return -EINVAL;
1152                 } else if (len < sizeof(stats)) {
1153                         extra_stats = false;
1154                         stats_size = sizeof(struct xdp_statistics_v1);
1155                 } else {
1156                         stats_size = sizeof(stats);
1157                 }
1158
1159                 mutex_lock(&xs->mutex);
1160                 stats.rx_dropped = xs->rx_dropped;
1161                 if (extra_stats) {
1162                         stats.rx_ring_full = xs->rx_queue_full;
1163                         stats.rx_fill_ring_empty_descs =
1164                                 xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
1165                         stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
1166                 } else {
1167                         stats.rx_dropped += xs->rx_queue_full;
1168                 }
1169                 stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
1170                 stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
1171                 mutex_unlock(&xs->mutex);
1172
1173                 if (copy_to_user(optval, &stats, stats_size))
1174                         return -EFAULT;
1175                 if (put_user(stats_size, optlen))
1176                         return -EFAULT;
1177
1178                 return 0;
1179         }
1180         case XDP_MMAP_OFFSETS:
1181         {
1182                 struct xdp_mmap_offsets off;
1183                 struct xdp_mmap_offsets_v1 off_v1;
1184                 bool flags_supported = true;
1185                 void *to_copy;
1186
1187                 if (len < sizeof(off_v1))
1188                         return -EINVAL;
1189                 else if (len < sizeof(off))
1190                         flags_supported = false;
1191
1192                 if (flags_supported) {
1193                         /* xdp_ring_offset is identical to xdp_ring_offset_v1
1194                          * except for the flags field added to the end.
1195                          */
1196                         xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
1197                                                &off.rx);
1198                         xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
1199                                                &off.tx);
1200                         xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
1201                                                &off.fr);
1202                         xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
1203                                                &off.cr);
1204                         off.rx.flags = offsetof(struct xdp_rxtx_ring,
1205                                                 ptrs.flags);
1206                         off.tx.flags = offsetof(struct xdp_rxtx_ring,
1207                                                 ptrs.flags);
1208                         off.fr.flags = offsetof(struct xdp_umem_ring,
1209                                                 ptrs.flags);
1210                         off.cr.flags = offsetof(struct xdp_umem_ring,
1211                                                 ptrs.flags);
1212
1213                         len = sizeof(off);
1214                         to_copy = &off;
1215                 } else {
1216                         xsk_enter_rxtx_offsets(&off_v1.rx);
1217                         xsk_enter_rxtx_offsets(&off_v1.tx);
1218                         xsk_enter_umem_offsets(&off_v1.fr);
1219                         xsk_enter_umem_offsets(&off_v1.cr);
1220
1221                         len = sizeof(off_v1);
1222                         to_copy = &off_v1;
1223                 }
1224
1225                 if (copy_to_user(optval, to_copy, len))
1226                         return -EFAULT;
1227                 if (put_user(len, optlen))
1228                         return -EFAULT;
1229
1230                 return 0;
1231         }
1232         case XDP_OPTIONS:
1233         {
1234                 struct xdp_options opts = {};
1235
1236                 if (len < sizeof(opts))
1237                         return -EINVAL;
1238
1239                 mutex_lock(&xs->mutex);
1240                 if (xs->zc)
1241                         opts.flags |= XDP_OPTIONS_ZEROCOPY;
1242                 mutex_unlock(&xs->mutex);
1243
1244                 len = sizeof(opts);
1245                 if (copy_to_user(optval, &opts, len))
1246                         return -EFAULT;
1247                 if (put_user(len, optlen))
1248                         return -EFAULT;
1249
1250                 return 0;
1251         }
1252         default:
1253                 break;
1254         }
1255
1256         return -EOPNOTSUPP;
1257 }
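
/* Example (editor's sketch): reading the statistics exposed above.  Passing
 * the full struct xdp_statistics returns the extended counters; binaries that
 * pass the v1 size still work, as handled in xsk_getsockopt():
 *
 *	struct xdp_statistics stats;
 *	socklen_t optlen = sizeof(stats);
 *
 *	if (!getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen))
 *		fprintf(stderr, "rx_dropped=%llu rx_ring_full=%llu\n",
 *			(unsigned long long)stats.rx_dropped,
 *			(unsigned long long)stats.rx_ring_full);
 */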
1258
1259 static int xsk_mmap(struct file *file, struct socket *sock,
1260                     struct vm_area_struct *vma)
1261 {
1262         loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
1263         unsigned long size = vma->vm_end - vma->vm_start;
1264         struct xdp_sock *xs = xdp_sk(sock->sk);
1265         struct xsk_queue *q = NULL;
1266         unsigned long pfn;
1267         struct page *qpg;
1268
1269         if (READ_ONCE(xs->state) != XSK_READY)
1270                 return -EBUSY;
1271
1272         if (offset == XDP_PGOFF_RX_RING) {
1273                 q = READ_ONCE(xs->rx);
1274         } else if (offset == XDP_PGOFF_TX_RING) {
1275                 q = READ_ONCE(xs->tx);
1276         } else {
1277                 /* Matches the smp_wmb() in XDP_UMEM_REG */
1278                 smp_rmb();
1279                 if (offset == XDP_UMEM_PGOFF_FILL_RING)
1280                         q = READ_ONCE(xs->fq_tmp);
1281                 else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
1282                         q = READ_ONCE(xs->cq_tmp);
1283         }
1284
1285         if (!q)
1286                 return -EINVAL;
1287
1288         /* Matches the smp_wmb() in xsk_init_queue */
1289         smp_rmb();
1290         qpg = virt_to_head_page(q->ring);
1291         if (size > page_size(qpg))
1292                 return -EINVAL;
1293
1294         pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
1295         return remap_pfn_range(vma, vma->vm_start, pfn,
1296                                size, vma->vm_page_prot);
1297 }
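
/* Example (editor's sketch): mapping the Rx ring from user space with the
 * offsets returned by XDP_MMAP_OFFSETS, which xsk_mmap() above serves.  The
 * producer/consumer indices and the descriptor array then live at the
 * advertised offsets inside the mapping; rx_size is the ring size requested
 * via XDP_RX_RING:
 *
 *	struct xdp_mmap_offsets off;
 *	socklen_t optlen = sizeof(off);
 *
 *	getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
 *
 *	void *map = mmap(NULL, off.rx.desc + rx_size * sizeof(struct xdp_desc),
 *			 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *			 fd, XDP_PGOFF_RX_RING);
 *	__u32 *producer       = (__u32 *)((char *)map + off.rx.producer);
 *	__u32 *consumer       = (__u32 *)((char *)map + off.rx.consumer);
 *	struct xdp_desc *ring = (struct xdp_desc *)((char *)map + off.rx.desc);
 */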
1298
1299 static int xsk_notifier(struct notifier_block *this,
1300                         unsigned long msg, void *ptr)
1301 {
1302         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1303         struct net *net = dev_net(dev);
1304         struct sock *sk;
1305
1306         switch (msg) {
1307         case NETDEV_UNREGISTER:
1308                 mutex_lock(&net->xdp.lock);
1309                 sk_for_each(sk, &net->xdp.list) {
1310                         struct xdp_sock *xs = xdp_sk(sk);
1311
1312                         mutex_lock(&xs->mutex);
1313                         if (xs->dev == dev) {
1314                                 sk->sk_err = ENETDOWN;
1315                                 if (!sock_flag(sk, SOCK_DEAD))
1316                                         sk_error_report(sk);
1317
1318                                 xsk_unbind_dev(xs);
1319
1320                                 /* Clear device references. */
1321                                 xp_clear_dev(xs->pool);
1322                         }
1323                         mutex_unlock(&xs->mutex);
1324                 }
1325                 mutex_unlock(&net->xdp.lock);
1326                 break;
1327         }
1328         return NOTIFY_DONE;
1329 }
1330
1331 static struct proto xsk_proto = {
1332         .name =         "XDP",
1333         .owner =        THIS_MODULE,
1334         .obj_size =     sizeof(struct xdp_sock),
1335 };
1336
1337 static const struct proto_ops xsk_proto_ops = {
1338         .family         = PF_XDP,
1339         .owner          = THIS_MODULE,
1340         .release        = xsk_release,
1341         .bind           = xsk_bind,
1342         .connect        = sock_no_connect,
1343         .socketpair     = sock_no_socketpair,
1344         .accept         = sock_no_accept,
1345         .getname        = sock_no_getname,
1346         .poll           = xsk_poll,
1347         .ioctl          = sock_no_ioctl,
1348         .listen         = sock_no_listen,
1349         .shutdown       = sock_no_shutdown,
1350         .setsockopt     = xsk_setsockopt,
1351         .getsockopt     = xsk_getsockopt,
1352         .sendmsg        = xsk_sendmsg,
1353         .recvmsg        = xsk_recvmsg,
1354         .mmap           = xsk_mmap,
1355         .sendpage       = sock_no_sendpage,
1356 };
1357
1358 static void xsk_destruct(struct sock *sk)
1359 {
1360         struct xdp_sock *xs = xdp_sk(sk);
1361
1362         if (!sock_flag(sk, SOCK_DEAD))
1363                 return;
1364
1365         if (!xp_put_pool(xs->pool))
1366                 xdp_put_umem(xs->umem, !xs->pool);
1367
1368         sk_refcnt_debug_dec(sk);
1369 }
1370
1371 static int xsk_create(struct net *net, struct socket *sock, int protocol,
1372                       int kern)
1373 {
1374         struct xdp_sock *xs;
1375         struct sock *sk;
1376
1377         if (!ns_capable(net->user_ns, CAP_NET_RAW))
1378                 return -EPERM;
1379         if (sock->type != SOCK_RAW)
1380                 return -ESOCKTNOSUPPORT;
1381
1382         if (protocol)
1383                 return -EPROTONOSUPPORT;
1384
1385         sock->state = SS_UNCONNECTED;
1386
1387         sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
1388         if (!sk)
1389                 return -ENOBUFS;
1390
1391         sock->ops = &xsk_proto_ops;
1392
1393         sock_init_data(sock, sk);
1394
1395         sk->sk_family = PF_XDP;
1396
1397         sk->sk_destruct = xsk_destruct;
1398         sk_refcnt_debug_inc(sk);
1399
1400         sock_set_flag(sk, SOCK_RCU_FREE);
1401
1402         xs = xdp_sk(sk);
1403         xs->state = XSK_READY;
1404         mutex_init(&xs->mutex);
1405         spin_lock_init(&xs->rx_lock);
1406
1407         INIT_LIST_HEAD(&xs->map_list);
1408         spin_lock_init(&xs->map_list_lock);
1409
1410         mutex_lock(&net->xdp.lock);
1411         sk_add_node_rcu(sk, &net->xdp.list);
1412         mutex_unlock(&net->xdp.lock);
1413
1414         local_bh_disable();
1415         sock_prot_inuse_add(net, &xsk_proto, 1);
1416         local_bh_enable();
1417
1418         return 0;
1419 }
1420
1421 static const struct net_proto_family xsk_family_ops = {
1422         .family = PF_XDP,
1423         .create = xsk_create,
1424         .owner  = THIS_MODULE,
1425 };
1426
1427 static struct notifier_block xsk_netdev_notifier = {
1428         .notifier_call  = xsk_notifier,
1429 };
1430
1431 static int __net_init xsk_net_init(struct net *net)
1432 {
1433         mutex_init(&net->xdp.lock);
1434         INIT_HLIST_HEAD(&net->xdp.list);
1435         return 0;
1436 }
1437
1438 static void __net_exit xsk_net_exit(struct net *net)
1439 {
1440         WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
1441 }
1442
1443 static struct pernet_operations xsk_net_ops = {
1444         .init = xsk_net_init,
1445         .exit = xsk_net_exit,
1446 };
1447
1448 static int __init xsk_init(void)
1449 {
1450         int err, cpu;
1451
1452         err = proto_register(&xsk_proto, 0 /* no slab */);
1453         if (err)
1454                 goto out;
1455
1456         err = sock_register(&xsk_family_ops);
1457         if (err)
1458                 goto out_proto;
1459
1460         err = register_pernet_subsys(&xsk_net_ops);
1461         if (err)
1462                 goto out_sk;
1463
1464         err = register_netdevice_notifier(&xsk_netdev_notifier);
1465         if (err)
1466                 goto out_pernet;
1467
1468         for_each_possible_cpu(cpu)
1469                 INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
1470         return 0;
1471
1472 out_pernet:
1473         unregister_pernet_subsys(&xsk_net_ops);
1474 out_sk:
1475         sock_unregister(PF_XDP);
1476 out_proto:
1477         proto_unregister(&xsk_proto);
1478 out:
1479         return err;
1480 }
1481
1482 fs_initcall(xsk_init);