1 /* Copyright (C) 2009 Red Hat, Inc.
2 * Author: Michael S. Tsirkin <mst@redhat.com>
4 * This work is licensed under the terms of the GNU GPL, version 2.
6 * virtio-net server in host kernel.
9 #include <linux/compat.h>
10 #include <linux/eventfd.h>
11 #include <linux/vhost.h>
12 #include <linux/virtio_net.h>
13 #include <linux/miscdevice.h>
14 #include <linux/module.h>
15 #include <linux/moduleparam.h>
16 #include <linux/mutex.h>
17 #include <linux/workqueue.h>
18 #include <linux/file.h>
19 #include <linux/slab.h>
20 #include <linux/sched/clock.h>
21 #include <linux/vmalloc.h>
23 #include <linux/net.h>
24 #include <linux/if_packet.h>
25 #include <linux/if_arp.h>
26 #include <linux/if_tun.h>
27 #include <linux/if_macvlan.h>
28 #include <linux/if_tap.h>
29 #include <linux/if_vlan.h>
35 static int experimental_zcopytx = 1;
36 module_param(experimental_zcopytx, int, 0444);
37 MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
38 " 1 -Enable; 0 - Disable");
40 /* Max number of bytes transferred before requeueing the job.
41 * Using this limit prevents one virtqueue from starving others. */
42 #define VHOST_NET_WEIGHT 0x80000
44 /* Max number of TX used buffers for outstanding zerocopy */
45 #define VHOST_MAX_PEND 128
46 #define VHOST_GOODCOPY_LEN 256
49 * For transmit, used buffer len is unused; we override it to track buffer
50 * status internally; used for zerocopy tx only.
52 /* Lower device DMA failed */
53 #define VHOST_DMA_FAILED_LEN ((__force __virtio32)3)
54 /* Lower device DMA done */
55 #define VHOST_DMA_DONE_LEN ((__force __virtio32)2)
56 /* Lower device DMA in progress */
57 #define VHOST_DMA_IN_PROGRESS ((__force __virtio32)1)
59 #define VHOST_DMA_CLEAR_LEN ((__force __virtio32)0)
61 #define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN)
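/* The "len" values above form an ordered encoding: CLEAR (0) and IN_PROGRESS (1)
 * are below DONE (2), while DONE (2) and FAILED (3) both compare >= DONE.  That is
 * why VHOST_DMA_IS_DONE() is a simple unsigned comparison against VHOST_DMA_DONE_LEN.
 */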
64 VHOST_NET_FEATURES = VHOST_FEATURES |
65 (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
66 (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
67 (1ULL << VIRTIO_F_IOMMU_PLATFORM)
76 struct vhost_net_ubuf_ref {
77 /* refcount follows semantics similar to kref:
78 * 0: object is released
79 * 1: no outstanding ubufs
80 * >1: outstanding ubufs
83 wait_queue_head_t wait;
84 struct vhost_virtqueue *vq;
87 struct vhost_net_virtqueue {
88 struct vhost_virtqueue vq;
91 /* vhost zerocopy support fields below: */
92 /* last used idx for outstanding DMA zerocopy buffers */
94 /* first used idx for DMA done zerocopy buffers */
96 /* an array of userspace buffers info */
97 struct ubuf_info *ubuf_info;
98 /* Reference counting for outstanding ubufs.
99 * Protected by vq mutex. Writers must also take device mutex. */
100 struct vhost_net_ubuf_ref *ubufs;
104 struct vhost_dev dev;
105 struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX];
106 struct vhost_poll poll[VHOST_NET_VQ_MAX];
107 /* Number of TX recently submitted.
108 * Protected by tx vq lock. */
110 /* Number of times zerocopy TX recently failed.
111 * Protected by tx vq lock. */
112 unsigned tx_zcopy_err;
113 /* Flush in progress. Protected by tx vq lock. */
117 static unsigned vhost_net_zcopy_mask __read_mostly;
119 static void vhost_net_enable_zcopy(int vq)
121 vhost_net_zcopy_mask |= 0x1 << vq;
124 static struct vhost_net_ubuf_ref *
125 vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
127 struct vhost_net_ubuf_ref *ubufs;
128 /* No zero copy backend? Nothing to count. */
131 ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL);
133 return ERR_PTR(-ENOMEM);
134 atomic_set(&ubufs->refcount, 1);
135 init_waitqueue_head(&ubufs->wait);
140 static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
142 int r = atomic_sub_return(1, &ubufs->refcount);
144 wake_up(&ubufs->wait);
148 static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs)
150 vhost_net_ubuf_put(ubufs);
151 wait_event(ubufs->wait, !atomic_read(&ubufs->refcount));
154 static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs)
156 vhost_net_ubuf_put_and_wait(ubufs);
160 static void vhost_net_clear_ubuf_info(struct vhost_net *n)
164 for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
165 kfree(n->vqs[i].ubuf_info);
166 n->vqs[i].ubuf_info = NULL;
170 static int vhost_net_set_ubuf_info(struct vhost_net *n)
175 for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
176 zcopy = vhost_net_zcopy_mask & (0x1 << i);
179 n->vqs[i].ubuf_info = kmalloc(sizeof(*n->vqs[i].ubuf_info) *
180 UIO_MAXIOV, GFP_KERNEL);
181 if (!n->vqs[i].ubuf_info)
187 vhost_net_clear_ubuf_info(n);
191 static void vhost_net_vq_reset(struct vhost_net *n)
195 vhost_net_clear_ubuf_info(n);
197 for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
198 n->vqs[i].done_idx = 0;
199 n->vqs[i].upend_idx = 0;
200 n->vqs[i].ubufs = NULL;
201 n->vqs[i].vhost_hlen = 0;
202 n->vqs[i].sock_hlen = 0;
207 static void vhost_net_tx_packet(struct vhost_net *net)
210 if (net->tx_packets < 1024)
213 net->tx_zcopy_err = 0;
216 static void vhost_net_tx_err(struct vhost_net *net)
221 static bool vhost_net_tx_select_zcopy(struct vhost_net *net)
223 /* TX flush waits for outstanding DMAs to be done.
224 * Don't start new DMAs.
226 return !net->tx_flush &&
227 net->tx_packets / 64 >= net->tx_zcopy_err;
230 static bool vhost_sock_zcopy(struct socket *sock)
232 return unlikely(experimental_zcopytx) &&
233 sock_flag(sock->sk, SOCK_ZEROCOPY);
236 /* The lower device may complete DMAs out of order. upend_idx tracks the tail
237 * of the used idx entries with DMA still in flight; done_idx tracks the head.
238 * Once the lower device has completed DMAs contiguously, we signal the guest's used idx. */
241 static void vhost_zerocopy_signal_used(struct vhost_net *net,
242 struct vhost_virtqueue *vq)
244 struct vhost_net_virtqueue *nvq =
245 container_of(vq, struct vhost_net_virtqueue, vq);
249 for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
250 if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
251 vhost_net_tx_err(net);
252 if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
253 vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
259 add = min(UIO_MAXIOV - nvq->done_idx, j);
260 vhost_add_used_and_signal_n(vq->dev, vq,
261 &vq->heads[nvq->done_idx], add);
262 nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV;
267 static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
269 struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
270 struct vhost_virtqueue *vq = ubufs->vq;
275 /* Set len to mark this descriptor's buffers as having completed DMA */
276 vq->heads[ubuf->desc].len = success ?
277 VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
278 cnt = vhost_net_ubuf_put(ubufs);
281 * Trigger polling thread if guest stopped submitting new buffers:
282 * in this case, the refcount after decrement will eventually reach 1.
283 * We also trigger polling periodically after each 16 packets
284 * (the value 16 here is more or less arbitrary, it's tuned to trigger
285 * less than 10% of the time).
287 if (cnt <= 1 || !(cnt % 16))
288 vhost_poll_queue(&vq->poll);
290 rcu_read_unlock_bh();
293 static inline unsigned long busy_clock(void)
295 return local_clock() >> 10;
298 static bool vhost_can_busy_poll(struct vhost_dev *dev,
299 unsigned long endtime)
301 return likely(!need_resched()) &&
302 likely(!time_after(busy_clock(), endtime)) &&
303 likely(!signal_pending(current)) &&
304 !vhost_has_work(dev);
307 static void vhost_net_disable_vq(struct vhost_net *n,
308 struct vhost_virtqueue *vq)
310 struct vhost_net_virtqueue *nvq =
311 container_of(vq, struct vhost_net_virtqueue, vq);
312 struct vhost_poll *poll = n->poll + (nvq - n->vqs);
313 if (!vq->private_data)
315 vhost_poll_stop(poll);
318 static int vhost_net_enable_vq(struct vhost_net *n,
319 struct vhost_virtqueue *vq)
321 struct vhost_net_virtqueue *nvq =
322 container_of(vq, struct vhost_net_virtqueue, vq);
323 struct vhost_poll *poll = n->poll + (nvq - n->vqs);
326 sock = vq->private_data;
330 return vhost_poll_start(poll, sock->file);
333 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
334 struct vhost_virtqueue *vq,
335 struct iovec iov[], unsigned int iov_size,
336 unsigned int *out_num, unsigned int *in_num)
338 unsigned long uninitialized_var(endtime);
339 int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
340 out_num, in_num, NULL, NULL);
342 if (r == vq->num && vq->busyloop_timeout) {
344 endtime = busy_clock() + vq->busyloop_timeout;
345 while (vhost_can_busy_poll(vq->dev, endtime) &&
346 vhost_vq_avail_empty(vq->dev, vq))
349 r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
350 out_num, in_num, NULL, NULL);
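/* Limits how many zerocopy TX buffers may be outstanding at once: once the
 * distance between upend_idx and done_idx (modulo UIO_MAXIOV) grows too large,
 * handle_tx() stops submitting new zerocopy DMAs until earlier ones complete.
 */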
356 static bool vhost_exceeds_maxpend(struct vhost_net *net)
358 struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
359 struct vhost_virtqueue *vq = &nvq->vq;
361 return (nvq->upend_idx + vq->num - VHOST_MAX_PEND) % UIO_MAXIOV
365 /* Expects to always be run from the workqueue, which acts as a
366 * read-side critical section for our kind of RCU. */
367 static void handle_tx(struct vhost_net *net)
369 struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
370 struct vhost_virtqueue *vq = &nvq->vq;
373 struct msghdr msg = {
378 .msg_flags = MSG_DONTWAIT,
380 size_t len, total_len = 0;
384 struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
385 bool zcopy, zcopy_used;
387 mutex_lock(&vq->mutex);
388 sock = vq->private_data;
392 if (!vq_iotlb_prefetch(vq))
395 vhost_disable_notify(&net->dev, vq);
397 hdr_size = nvq->vhost_hlen;
401 /* Release DMAs done buffers first */
403 vhost_zerocopy_signal_used(net, vq);
405 /* If too many DMAs are outstanding, queue the work.
406 * Handles upend_idx wrap-around.
408 if (unlikely(vhost_exceeds_maxpend(net)))
411 head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
414 /* On error, stop handling until the next kick. */
415 if (unlikely(head < 0))
417 /* Nothing new? Wait for eventfd to tell us they refilled. */
418 if (head == vq->num) {
419 if (unlikely(vhost_enable_notify(&net->dev, vq))) {
420 vhost_disable_notify(&net->dev, vq);
426 vq_err(vq, "Unexpected descriptor format for TX: "
427 "out %d, int %d\n", out, in);
430 /* Skip header. TODO: support TSO. */
431 len = iov_length(vq->iov, out);
432 iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
433 iov_iter_advance(&msg.msg_iter, hdr_size);
435 if (!msg_data_left(&msg)) {
436 vq_err(vq, "Unexpected header len for TX: "
437 "%zd expected %zd\n",
441 len = msg_data_left(&msg);
443 zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
444 && (nvq->upend_idx + 1) % UIO_MAXIOV !=
446 && vhost_net_tx_select_zcopy(net);
448 /* use msg_control to pass vhost zerocopy ubuf info to skb */
450 struct ubuf_info *ubuf;
451 ubuf = nvq->ubuf_info + nvq->upend_idx;
453 vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
454 vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
455 ubuf->callback = vhost_zerocopy_callback;
456 ubuf->ctx = nvq->ubufs;
457 ubuf->desc = nvq->upend_idx;
458 msg.msg_control = ubuf;
459 msg.msg_controllen = sizeof(ubuf);
461 atomic_inc(&ubufs->refcount);
462 nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
464 msg.msg_control = NULL;
469 if (total_len < VHOST_NET_WEIGHT &&
470 !vhost_vq_avail_empty(&net->dev, vq) &&
471 likely(!vhost_exceeds_maxpend(net))) {
472 msg.msg_flags |= MSG_MORE;
474 msg.msg_flags &= ~MSG_MORE;
477 /* TODO: Check specific error and bomb out unless ENOBUFS? */
478 err = sock->ops->sendmsg(sock, &msg, len);
479 if (unlikely(err < 0)) {
481 vhost_net_ubuf_put(ubufs);
482 nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
485 vhost_discard_vq_desc(vq, 1);
489 pr_debug("Truncated TX packet: "
490 " len %d != %zd\n", err, len);
492 vhost_add_used_and_signal(&net->dev, vq, head, 0);
494 vhost_zerocopy_signal_used(net, vq);
495 vhost_net_tx_packet(net);
496 if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
497 vhost_poll_queue(&vq->poll);
502 mutex_unlock(&vq->mutex);
505 static int peek_head_len(struct sock *sk)
507 struct socket *sock = sk->sk_socket;
508 struct sk_buff *head;
512 if (sock->ops->peek_len)
513 return sock->ops->peek_len(sock);
515 spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
516 head = skb_peek(&sk->sk_receive_queue);
519 if (skb_vlan_tag_present(head))
523 spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
527 static int sk_has_rx_data(struct sock *sk)
529 struct socket *sock = sk->sk_socket;
531 if (sock->ops->peek_len)
532 return sock->ops->peek_len(sock);
534 return !skb_queue_empty(&sk->sk_receive_queue);
537 static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
539 struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
540 struct vhost_virtqueue *vq = &nvq->vq;
541 unsigned long uninitialized_var(endtime);
542 int len = peek_head_len(sk);
544 if (!len && vq->busyloop_timeout) {
545 /* Both tx vq and rx socket were polled here */
546 mutex_lock(&vq->mutex);
547 vhost_disable_notify(&net->dev, vq);
550 endtime = busy_clock() + vq->busyloop_timeout;
552 while (vhost_can_busy_poll(&net->dev, endtime) &&
553 !sk_has_rx_data(sk) &&
554 vhost_vq_avail_empty(&net->dev, vq))
559 if (vhost_enable_notify(&net->dev, vq))
560 vhost_poll_queue(&vq->poll);
561 mutex_unlock(&vq->mutex);
563 len = peek_head_len(sk);
569 /* This is a multi-buffer version of vhost_get_vq_desc(), which works if
570 * vq has read descriptors only.
571 * @vq - the relevant virtqueue
572 * @datalen - data length we'll be reading
573 * @iovcount - returned count of io vectors we fill
575 * @log_num - log offset
576 * @quota - headcount quota, 1 for big buffer
577 * returns number of buffer heads allocated, negative on error
579 static int get_rx_bufs(struct vhost_virtqueue *vq,
580 struct vring_used_elem *heads,
583 struct vhost_log *log,
587 unsigned int out, in;
592 /* len is always initialized before use since we are always called with
595 u32 uninitialized_var(len);
597 while (datalen > 0 && headcount < quota) {
598 if (unlikely(seg >= UIO_MAXIOV)) {
602 r = vhost_get_vq_desc(vq, vq->iov + seg,
603 ARRAY_SIZE(vq->iov) - seg, &out,
613 if (unlikely(out || in <= 0)) {
614 vq_err(vq, "unexpected descriptor format for RX: "
615 "out %d, in %d\n", out, in);
623 heads[headcount].id = cpu_to_vhost32(vq, d);
624 len = iov_length(vq->iov + seg, in);
625 heads[headcount].len = cpu_to_vhost32(vq, len);
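/* If all data fit, datalen is now <= 0: the final buffer was at least as
 * large as the remaining data, so trim the recorded length of the last head
 * down to the bytes actually needed (the overrun case below discards
 * everything anyway).
 */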
630 heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
636 if (unlikely(datalen > 0)) {
642 vhost_discard_vq_desc(vq, headcount);
646 /* Expects to always be run from the workqueue, which acts as a
647 * read-side critical section for our kind of RCU. */
648 static void handle_rx(struct vhost_net *net)
650 struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
651 struct vhost_virtqueue *vq = &nvq->vq;
652 unsigned uninitialized_var(in), log;
653 struct vhost_log *vq_log;
654 struct msghdr msg = {
657 .msg_control = NULL, /* FIXME: get and handle RX aux data. */
659 .msg_flags = MSG_DONTWAIT,
661 struct virtio_net_hdr hdr = {
663 .gso_type = VIRTIO_NET_HDR_GSO_NONE
665 size_t total_len = 0;
668 size_t vhost_hlen, sock_hlen;
669 size_t vhost_len, sock_len;
671 struct iov_iter fixup;
672 __virtio16 num_buffers;
674 mutex_lock(&vq->mutex);
675 sock = vq->private_data;
679 if (!vq_iotlb_prefetch(vq))
682 vhost_disable_notify(&net->dev, vq);
683 vhost_net_disable_vq(net, vq);
685 vhost_hlen = nvq->vhost_hlen;
686 sock_hlen = nvq->sock_hlen;
688 vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
690 mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
692 while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
693 sock_len += sock_hlen;
694 vhost_len = sock_len + vhost_hlen;
695 headcount = get_rx_bufs(vq, vq->heads, vhost_len,
697 likely(mergeable) ? UIO_MAXIOV : 1);
698 /* On error, stop handling until the next kick. */
699 if (unlikely(headcount < 0))
701 /* On overrun, truncate and discard */
702 if (unlikely(headcount > UIO_MAXIOV)) {
703 iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
704 err = sock->ops->recvmsg(sock, &msg,
705 1, MSG_DONTWAIT | MSG_TRUNC);
706 pr_debug("Discarded rx packet: len %zd\n", sock_len);
709 /* OK, now we need to know about added descriptors. */
711 if (unlikely(vhost_enable_notify(&net->dev, vq))) {
712 /* They have slipped one in as we were
713 * doing that: check again. */
714 vhost_disable_notify(&net->dev, vq);
717 /* Nothing new? Wait for eventfd to tell us
721 /* We don't need to be notified again. */
722 iov_iter_init(&msg.msg_iter, READ, vq->iov, in, vhost_len);
723 fixup = msg.msg_iter;
724 if (unlikely(vhost_hlen)) {
725 /* We will supply the header ourselves
728 iov_iter_advance(&msg.msg_iter, vhost_hlen);
730 err = sock->ops->recvmsg(sock, &msg,
731 sock_len, MSG_DONTWAIT | MSG_TRUNC);
732 /* Userspace might have consumed the packet meanwhile:
733 * it is not supposed to do this, but it can be hard
734 * to prevent. Discard any data we got and keep going. */
735 if (unlikely(err != sock_len)) {
736 pr_debug("Discarded rx packet: "
737 " len %d, expected %zd\n", err, sock_len);
738 vhost_discard_vq_desc(vq, headcount);
741 /* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
742 if (unlikely(vhost_hlen)) {
743 if (copy_to_iter(&hdr, sizeof(hdr),
744 &fixup) != sizeof(hdr)) {
745 vq_err(vq, "Unable to write vnet_hdr "
746 "at addr %p\n", vq->iov->iov_base);
750 /* Header came from socket; we'll need to patch
751 * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF
753 iov_iter_advance(&fixup, sizeof(hdr));
755 /* TODO: Should check and handle checksum. */
757 num_buffers = cpu_to_vhost16(vq, headcount);
758 if (likely(mergeable) &&
759 copy_to_iter(&num_buffers, sizeof num_buffers,
760 &fixup) != sizeof num_buffers) {
761 vq_err(vq, "Failed num_buffers write");
762 vhost_discard_vq_desc(vq, headcount);
765 vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
767 if (unlikely(vq_log))
768 vhost_log_write(vq, vq_log, log, vhost_len);
769 total_len += vhost_len;
770 if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
771 vhost_poll_queue(&vq->poll);
775 vhost_net_enable_vq(net, vq);
777 mutex_unlock(&vq->mutex);
780 static void handle_tx_kick(struct vhost_work *work)
782 struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
784 struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
789 static void handle_rx_kick(struct vhost_work *work)
791 struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
793 struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
798 static void handle_tx_net(struct vhost_work *work)
800 struct vhost_net *net = container_of(work, struct vhost_net,
801 poll[VHOST_NET_VQ_TX].work);
805 static void handle_rx_net(struct vhost_work *work)
807 struct vhost_net *net = container_of(work, struct vhost_net,
808 poll[VHOST_NET_VQ_RX].work);
812 static int vhost_net_open(struct inode *inode, struct file *f)
815 struct vhost_dev *dev;
816 struct vhost_virtqueue **vqs;
819 n = kmalloc(sizeof *n, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
821 n = vmalloc(sizeof *n);
825 vqs = kmalloc(VHOST_NET_VQ_MAX * sizeof(*vqs), GFP_KERNEL);
832 vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq;
833 vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq;
834 n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick;
835 n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick;
836 for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
837 n->vqs[i].ubufs = NULL;
838 n->vqs[i].ubuf_info = NULL;
839 n->vqs[i].upend_idx = 0;
840 n->vqs[i].done_idx = 0;
841 n->vqs[i].vhost_hlen = 0;
842 n->vqs[i].sock_hlen = 0;
844 vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX);
846 vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
847 vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
854 static struct socket *vhost_net_stop_vq(struct vhost_net *n,
855 struct vhost_virtqueue *vq)
859 mutex_lock(&vq->mutex);
860 sock = vq->private_data;
861 vhost_net_disable_vq(n, vq);
862 vq->private_data = NULL;
863 mutex_unlock(&vq->mutex);
867 static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock,
868 struct socket **rx_sock)
870 *tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX].vq);
871 *rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq);
874 static void vhost_net_flush_vq(struct vhost_net *n, int index)
876 vhost_poll_flush(n->poll + index);
877 vhost_poll_flush(&n->vqs[index].vq.poll);
880 static void vhost_net_flush(struct vhost_net *n)
882 vhost_net_flush_vq(n, VHOST_NET_VQ_TX);
883 vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
884 if (n->vqs[VHOST_NET_VQ_TX].ubufs) {
885 mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
887 mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
888 /* Wait for all lower device DMAs to complete. */
889 vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs);
890 mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
892 atomic_set(&n->vqs[VHOST_NET_VQ_TX].ubufs->refcount, 1);
893 mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
897 static int vhost_net_release(struct inode *inode, struct file *f)
899 struct vhost_net *n = f->private_data;
900 struct socket *tx_sock;
901 struct socket *rx_sock;
903 vhost_net_stop(n, &tx_sock, &rx_sock);
905 vhost_dev_stop(&n->dev);
906 vhost_dev_cleanup(&n->dev, false);
907 vhost_net_vq_reset(n);
912 /* Make sure no callbacks are outstanding */
913 synchronize_rcu_bh();
914 /* We do an extra flush before freeing memory,
915 * since jobs can re-queue themselves. */
922 static struct socket *get_raw_socket(int fd)
925 struct sockaddr_ll sa;
926 char buf[MAX_ADDR_LEN];
928 int uaddr_len = sizeof uaddr, r;
929 struct socket *sock = sockfd_lookup(fd, &r);
932 return ERR_PTR(-ENOTSOCK);
934 /* Parameter checking */
935 if (sock->sk->sk_type != SOCK_RAW) {
936 r = -ESOCKTNOSUPPORT;
940 r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa,
945 if (uaddr.sa.sll_family != AF_PACKET) {
955 static struct socket *get_tap_socket(int fd)
957 struct file *file = fget(fd);
961 return ERR_PTR(-EBADF);
962 sock = tun_get_socket(file);
965 sock = tap_get_socket(file);
971 static struct socket *get_socket(int fd)
975 /* special case to disable backend */
978 sock = get_raw_socket(fd);
981 sock = get_tap_socket(fd);
984 return ERR_PTR(-ENOTSOCK);
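/* A backend fd must resolve to either an AF_PACKET raw socket or a tun/tap
 * (or macvtap) socket; fd == -1 is the special case that disables the backend.
 */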
987 static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
989 struct socket *sock, *oldsock;
990 struct vhost_virtqueue *vq;
991 struct vhost_net_virtqueue *nvq;
992 struct vhost_net_ubuf_ref *ubufs, *oldubufs = NULL;
995 mutex_lock(&n->dev.mutex);
996 r = vhost_dev_check_owner(&n->dev);
1000 if (index >= VHOST_NET_VQ_MAX) {
1004 vq = &n->vqs[index].vq;
1005 nvq = &n->vqs[index];
1006 mutex_lock(&vq->mutex);
1008 /* Verify that ring has been setup correctly. */
1009 if (!vhost_vq_access_ok(vq)) {
1013 sock = get_socket(fd);
1019 /* start polling new socket */
1020 oldsock = vq->private_data;
1021 if (sock != oldsock) {
1022 ubufs = vhost_net_ubuf_alloc(vq,
1023 sock && vhost_sock_zcopy(sock));
1024 if (IS_ERR(ubufs)) {
1029 vhost_net_disable_vq(n, vq);
1030 vq->private_data = sock;
1031 r = vhost_vq_init_access(vq);
1034 r = vhost_net_enable_vq(n, vq);
1038 oldubufs = nvq->ubufs;
1042 n->tx_zcopy_err = 0;
1043 n->tx_flush = false;
1046 mutex_unlock(&vq->mutex);
1049 vhost_net_ubuf_put_wait_and_free(oldubufs);
1050 mutex_lock(&vq->mutex);
1051 vhost_zerocopy_signal_used(n, vq);
1052 mutex_unlock(&vq->mutex);
1056 vhost_net_flush_vq(n, index);
1057 sockfd_put(oldsock);
1060 mutex_unlock(&n->dev.mutex);
1064 vq->private_data = oldsock;
1065 vhost_net_enable_vq(n, vq);
1067 vhost_net_ubuf_put_wait_and_free(ubufs);
1071 mutex_unlock(&vq->mutex);
1073 mutex_unlock(&n->dev.mutex);
1077 static long vhost_net_reset_owner(struct vhost_net *n)
1079 struct socket *tx_sock = NULL;
1080 struct socket *rx_sock = NULL;
1082 struct vhost_umem *umem;
1084 mutex_lock(&n->dev.mutex);
1085 err = vhost_dev_check_owner(&n->dev);
1088 umem = vhost_dev_reset_owner_prepare();
1093 vhost_net_stop(n, &tx_sock, &rx_sock);
1095 vhost_dev_reset_owner(&n->dev, umem);
1096 vhost_net_vq_reset(n);
1098 mutex_unlock(&n->dev.mutex);
1100 sockfd_put(tx_sock);
1102 sockfd_put(rx_sock);
1106 static int vhost_net_set_features(struct vhost_net *n, u64 features)
1108 size_t vhost_hlen, sock_hlen, hdr_len;
1111 hdr_len = (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
1112 (1ULL << VIRTIO_F_VERSION_1))) ?
1113 sizeof(struct virtio_net_hdr_mrg_rxbuf) :
1114 sizeof(struct virtio_net_hdr);
1115 if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
1116 /* vhost provides vnet_hdr */
1117 vhost_hlen = hdr_len;
1120 /* socket provides vnet_hdr */
1122 sock_hlen = hdr_len;
1124 mutex_lock(&n->dev.mutex);
1125 if ((features & (1 << VHOST_F_LOG_ALL)) &&
1126 !vhost_log_access_ok(&n->dev))
1129 if ((features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) {
1130 if (vhost_init_device_iotlb(&n->dev, true))
1134 for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
1135 mutex_lock(&n->vqs[i].vq.mutex);
1136 n->vqs[i].vq.acked_features = features;
1137 n->vqs[i].vhost_hlen = vhost_hlen;
1138 n->vqs[i].sock_hlen = sock_hlen;
1139 mutex_unlock(&n->vqs[i].vq.mutex);
1141 mutex_unlock(&n->dev.mutex);
1145 mutex_unlock(&n->dev.mutex);
1149 static long vhost_net_set_owner(struct vhost_net *n)
1153 mutex_lock(&n->dev.mutex);
1154 if (vhost_dev_has_owner(&n->dev)) {
1158 r = vhost_net_set_ubuf_info(n);
1161 r = vhost_dev_set_owner(&n->dev);
1163 vhost_net_clear_ubuf_info(n);
1166 mutex_unlock(&n->dev.mutex);
1170 static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
1173 struct vhost_net *n = f->private_data;
1174 void __user *argp = (void __user *)arg;
1175 u64 __user *featurep = argp;
1176 struct vhost_vring_file backend;
1181 case VHOST_NET_SET_BACKEND:
1182 if (copy_from_user(&backend, argp, sizeof backend))
1184 return vhost_net_set_backend(n, backend.index, backend.fd);
1185 case VHOST_GET_FEATURES:
1186 features = VHOST_NET_FEATURES;
1187 if (copy_to_user(featurep, &features, sizeof features))
1190 case VHOST_SET_FEATURES:
1191 if (copy_from_user(&features, featurep, sizeof features))
1193 if (features & ~VHOST_NET_FEATURES)
1195 return vhost_net_set_features(n, features);
1196 case VHOST_RESET_OWNER:
1197 return vhost_net_reset_owner(n);
1198 case VHOST_SET_OWNER:
1199 return vhost_net_set_owner(n);
1201 mutex_lock(&n->dev.mutex);
1202 r = vhost_dev_ioctl(&n->dev, ioctl, argp);
1203 if (r == -ENOIOCTLCMD)
1204 r = vhost_vring_ioctl(&n->dev, ioctl, argp);
1207 mutex_unlock(&n->dev.mutex);
1212 #ifdef CONFIG_COMPAT
1213 static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl,
1216 return vhost_net_ioctl(f, ioctl, (unsigned long)compat_ptr(arg));
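/* For reference, a minimal userspace sequence for driving this device
 * (illustrative only, error handling omitted; the vring setup details and
 * tap_fd come from the caller's own configuration):
 *
 *   int vhost = open("/dev/vhost-net", O_RDWR);
 *   ioctl(vhost, VHOST_SET_OWNER, NULL);
 *   uint64_t features;
 *   ioctl(vhost, VHOST_GET_FEATURES, &features);
 *   ioctl(vhost, VHOST_SET_FEATURES, &features);  // after masking as needed
 *   // ... VHOST_SET_MEM_TABLE and VHOST_SET_VRING_NUM/ADDR/KICK/CALL ...
 *   struct vhost_vring_file backend = { .index = 0, .fd = tap_fd };
 *   ioctl(vhost, VHOST_NET_SET_BACKEND, &backend);
 */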
1220 static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
1222 struct file *file = iocb->ki_filp;
1223 struct vhost_net *n = file->private_data;
1224 struct vhost_dev *dev = &n->dev;
1225 int noblock = file->f_flags & O_NONBLOCK;
1227 return vhost_chr_read_iter(dev, to, noblock);
1230 static ssize_t vhost_net_chr_write_iter(struct kiocb *iocb,
1231 struct iov_iter *from)
1233 struct file *file = iocb->ki_filp;
1234 struct vhost_net *n = file->private_data;
1235 struct vhost_dev *dev = &n->dev;
1237 return vhost_chr_write_iter(dev, from);
1240 static unsigned int vhost_net_chr_poll(struct file *file, poll_table *wait)
1242 struct vhost_net *n = file->private_data;
1243 struct vhost_dev *dev = &n->dev;
1245 return vhost_chr_poll(file, dev, wait);
1248 static const struct file_operations vhost_net_fops = {
1249 .owner = THIS_MODULE,
1250 .release = vhost_net_release,
1251 .read_iter = vhost_net_chr_read_iter,
1252 .write_iter = vhost_net_chr_write_iter,
1253 .poll = vhost_net_chr_poll,
1254 .unlocked_ioctl = vhost_net_ioctl,
1255 #ifdef CONFIG_COMPAT
1256 .compat_ioctl = vhost_net_compat_ioctl,
1258 .open = vhost_net_open,
1259 .llseek = noop_llseek,
1262 static struct miscdevice vhost_net_misc = {
1263 .minor = VHOST_NET_MINOR,
1264 .name = "vhost-net",
1265 .fops = &vhost_net_fops,
1268 static int vhost_net_init(void)
1270 if (experimental_zcopytx)
1271 vhost_net_enable_zcopy(VHOST_NET_VQ_TX);
1272 return misc_register(&vhost_net_misc);
1274 module_init(vhost_net_init);
1276 static void vhost_net_exit(void)
1278 misc_deregister(&vhost_net_misc);
1280 module_exit(vhost_net_exit);
1282 MODULE_VERSION("0.0.1");
1283 MODULE_LICENSE("GPL v2");
1284 MODULE_AUTHOR("Michael S. Tsirkin");
1285 MODULE_DESCRIPTION("Host kernel accelerator for virtio net");
1286 MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR);
1287 MODULE_ALIAS("devname:vhost-net");