// SPDX-License-Identifier: GPL-2.0-only
/*
 * VDUSE: vDPA Device in Userspace
 *
 * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved.
 *
 * Author: Xie Yongji <xieyongji@bytedance.com>
 */

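/*
 * Rough userspace flow (illustrative sketch only; names and error handling
 * are omitted): create a device via the control node, then serve requests
 * on the per-device node.
 *
 *	int ctrl = open("/dev/vduse/control", O_RDWR);
 *	ioctl(ctrl, VDUSE_CREATE_DEV, &dev_config);
 *
 *	int fd = open("/dev/vduse/foo", O_RDWR);
 *	for (;;) {
 *		struct vduse_dev_request req;
 *		struct vduse_dev_response resp = { 0 };
 *
 *		read(fd, &req, sizeof(req));	// fetch the next request
 *		// ... handle req.type (VDUSE_GET_VQ_STATE, VDUSE_SET_STATUS,
 *		// VDUSE_UPDATE_IOTLB) and fill in resp ...
 *		resp.request_id = req.request_id;
 *		resp.result = VDUSE_REQ_RESULT_OK;
 *		write(fd, &resp, sizeof(resp));	// complete it
 *	}
 */
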
#include <linux/init.h>
#include <linux/module.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/dma-map-ops.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/vdpa.h>
#include <linux/nospec.h>
#include <uapi/linux/vduse.h>
#include <uapi/linux/vdpa.h>
#include <uapi/linux/virtio_config.h>
#include <uapi/linux/virtio_ids.h>
#include <uapi/linux/virtio_blk.h>
#include <linux/mod_devicetable.h>

#include "iova_domain.h"

#define DRV_AUTHOR "Yongji Xie <xieyongji@bytedance.com>"
#define DRV_DESC "vDPA Device in Userspace"
#define DRV_LICENSE "GPL v2"

#define VDUSE_DEV_MAX (1U << MINORBITS)
#define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
#define VDUSE_IOVA_SIZE (128 * 1024 * 1024)
#define VDUSE_MSG_DEFAULT_TIMEOUT 30

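/*
 * The 64 MB bounce buffer lives inside the 128 MB per-device IOVA space;
 * see the vduse_domain_create(VDUSE_IOVA_SIZE - 1, VDUSE_BOUNCE_SIZE)
 * call in vduse_create_dev() below.
 */
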
struct vduse_virtqueue {
	u16 index;
	u16 num_max;
	u32 num;
	u64 desc_addr;
	u64 driver_addr;
	u64 device_addr;
	struct vdpa_vq_state state;
	bool ready;
	bool kicked;
	spinlock_t kick_lock;
	spinlock_t irq_lock;
	struct eventfd_ctx *kickfd;
	struct vdpa_callback cb;
	struct work_struct inject;
	struct work_struct kick;
};

struct vduse_vdpa {
	struct vdpa_device vdpa;
	struct vduse_dev *dev;
};

struct vduse_dev {
	struct vduse_vdpa *vdev;
	struct device *dev;
	struct vduse_virtqueue *vqs;
	struct vduse_iova_domain *domain;
	char *name;
	struct mutex lock;
	spinlock_t msg_lock;
	u64 msg_unique;
	u32 msg_timeout;
	wait_queue_head_t waitq;
	struct list_head send_list;
	struct list_head recv_list;
	struct vdpa_callback config_cb;
	struct work_struct inject;
	spinlock_t irq_lock;
	struct rw_semaphore rwsem;
	int minor;
	bool broken;
	bool connected;
	u64 api_version;
	u64 device_features;
	u64 driver_features;
	u32 device_id;
	u32 vendor_id;
	u32 generation;
	u32 config_size;
	void *config;
	u8 status;
	u32 vq_num;
	u32 vq_align;
};

struct vduse_dev_msg {
	struct vduse_dev_request req;
	struct vduse_dev_response resp;
	struct list_head list;
	wait_queue_head_t waitq;
	bool completed;
};

struct vduse_control {
	u64 api_version;
};

static DEFINE_MUTEX(vduse_lock);
static DEFINE_IDR(vduse_idr);

static dev_t vduse_major;
static struct class *vduse_class;
static struct cdev vduse_ctrl_cdev;
static struct cdev vduse_cdev;
static struct workqueue_struct *vduse_irq_wq;

static u32 allowed_device_id[] = {
	VIRTIO_ID_BLOCK,
};

static inline struct vduse_dev *vdpa_to_vduse(struct vdpa_device *vdpa)
{
	struct vduse_vdpa *vdev = container_of(vdpa, struct vduse_vdpa, vdpa);

	return vdev->dev;
}

static inline struct vduse_dev *dev_to_vduse(struct device *dev)
{
	struct vdpa_device *vdpa = dev_to_vdpa(dev);

	return vdpa_to_vduse(vdpa);
}

static struct vduse_dev_msg *vduse_find_msg(struct list_head *head,
					    u32 request_id)
{
	struct vduse_dev_msg *msg;

	list_for_each_entry(msg, head, list) {
		if (msg->req.request_id == request_id) {
			list_del(&msg->list);
			return msg;
		}
	}

	return NULL;
}

static struct vduse_dev_msg *vduse_dequeue_msg(struct list_head *head)
{
	struct vduse_dev_msg *msg = NULL;

	if (!list_empty(head)) {
		msg = list_first_entry(head, struct vduse_dev_msg, list);
		list_del(&msg->list);
	}

	return msg;
}

static void vduse_enqueue_msg(struct list_head *head,
			      struct vduse_dev_msg *msg)
{
	list_add_tail(&msg->list, head);
}

static void vduse_dev_broken(struct vduse_dev *dev)
{
	struct vduse_dev_msg *msg, *tmp;

	if (unlikely(dev->broken))
		return;

	list_splice_init(&dev->recv_list, &dev->send_list);
	list_for_each_entry_safe(msg, tmp, &dev->send_list, list) {
		list_del(&msg->list);
		msg->completed = 1;
		msg->resp.result = VDUSE_REQ_RESULT_FAILED;
		wake_up(&msg->waitq);
	}
	dev->broken = true;
	wake_up(&dev->waitq);
}

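/*
 * Send one request to the userspace daemon and wait (killable, optionally
 * bounded by dev->msg_timeout seconds) for its response. On timeout the
 * device is marked broken and all in-flight messages are failed.
 */
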
static int vduse_dev_msg_sync(struct vduse_dev *dev,
			      struct vduse_dev_msg *msg)
{
	int ret;

	if (unlikely(dev->broken))
		return -EIO;

	init_waitqueue_head(&msg->waitq);
	spin_lock(&dev->msg_lock);
	if (unlikely(dev->broken)) {
		spin_unlock(&dev->msg_lock);
		return -EIO;
	}
	msg->req.request_id = dev->msg_unique++;
	vduse_enqueue_msg(&dev->send_list, msg);
	wake_up(&dev->waitq);
	spin_unlock(&dev->msg_lock);
	if (dev->msg_timeout)
		ret = wait_event_killable_timeout(msg->waitq, msg->completed,
						  (long)dev->msg_timeout * HZ);
	else
		ret = wait_event_killable(msg->waitq, msg->completed);

	spin_lock(&dev->msg_lock);
	if (!msg->completed) {
		list_del(&msg->list);
		msg->resp.result = VDUSE_REQ_RESULT_FAILED;
		/* Mark the device as malfunctioning on timeout */
		if (!ret)
			vduse_dev_broken(dev);
	}
	ret = (msg->resp.result == VDUSE_REQ_RESULT_OK) ? 0 : -EIO;
	spin_unlock(&dev->msg_lock);

	return ret;
}

static int vduse_dev_get_vq_state_packed(struct vduse_dev *dev,
					 struct vduse_virtqueue *vq,
					 struct vdpa_vq_state_packed *packed)
{
	struct vduse_dev_msg msg = { 0 };
	int ret;

	msg.req.type = VDUSE_GET_VQ_STATE;
	msg.req.vq_state.index = vq->index;

	ret = vduse_dev_msg_sync(dev, &msg);
	if (ret)
		return ret;

	packed->last_avail_counter =
			msg.resp.vq_state.packed.last_avail_counter & 0x0001;
	packed->last_avail_idx =
			msg.resp.vq_state.packed.last_avail_idx & 0x7FFF;
	packed->last_used_counter =
			msg.resp.vq_state.packed.last_used_counter & 0x0001;
	packed->last_used_idx =
			msg.resp.vq_state.packed.last_used_idx & 0x7FFF;

	return 0;
}

static int vduse_dev_get_vq_state_split(struct vduse_dev *dev,
					struct vduse_virtqueue *vq,
					struct vdpa_vq_state_split *split)
{
	struct vduse_dev_msg msg = { 0 };
	int ret;

	msg.req.type = VDUSE_GET_VQ_STATE;
	msg.req.vq_state.index = vq->index;

	ret = vduse_dev_msg_sync(dev, &msg);
	if (ret)
		return ret;

	split->avail_index = msg.resp.vq_state.split.avail_index;

	return 0;
}

static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
{
	struct vduse_dev_msg msg = { 0 };

	msg.req.type = VDUSE_SET_STATUS;
	msg.req.s.status = status;

	return vduse_dev_msg_sync(dev, &msg);
}

static int vduse_dev_update_iotlb(struct vduse_dev *dev,
				  u64 start, u64 last)
{
	struct vduse_dev_msg msg = { 0 };

	if (last < start)
		return -EINVAL;

	msg.req.type = VDUSE_UPDATE_IOTLB;
	msg.req.iova.start = start;
	msg.req.iova.last = last;

	return vduse_dev_msg_sync(dev, &msg);
}

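/*
 * File operations for /dev/vduse/$DEVICE: the daemon read()s pending
 * vduse_dev_request messages off send_list (they park on recv_list while
 * outstanding) and write()s back matching vduse_dev_response messages.
 */
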
static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct vduse_dev *dev = file->private_data;
	struct vduse_dev_msg *msg;
	int size = sizeof(struct vduse_dev_request);
	ssize_t ret;

	if (iov_iter_count(to) < size)
		return -EINVAL;

	spin_lock(&dev->msg_lock);
	while (1) {
		msg = vduse_dequeue_msg(&dev->send_list);
		if (msg)
			break;

		ret = -EAGAIN;
		if (file->f_flags & O_NONBLOCK)
			goto unlock;

		spin_unlock(&dev->msg_lock);
		ret = wait_event_interruptible_exclusive(dev->waitq,
					!list_empty(&dev->send_list));
		if (ret)
			return ret;

		spin_lock(&dev->msg_lock);
	}
	spin_unlock(&dev->msg_lock);
	ret = copy_to_iter(&msg->req, size, to);
	spin_lock(&dev->msg_lock);
	if (ret != size) {
		ret = -EFAULT;
		vduse_enqueue_msg(&dev->send_list, msg);
		goto unlock;
	}
	vduse_enqueue_msg(&dev->recv_list, msg);
unlock:
	spin_unlock(&dev->msg_lock);

	return ret;
}

static bool is_mem_zero(const char *ptr, int size)
{
	int i;

	for (i = 0; i < size; i++) {
		if (ptr[i])
			return false;
	}

	return true;
}

static ssize_t vduse_dev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct vduse_dev *dev = file->private_data;
	struct vduse_dev_response resp;
	struct vduse_dev_msg *msg;
	size_t ret;

	ret = copy_from_iter(&resp, sizeof(resp), from);
	if (ret != sizeof(resp))
		return -EINVAL;

	if (!is_mem_zero((const char *)resp.reserved, sizeof(resp.reserved)))
		return -EINVAL;

	spin_lock(&dev->msg_lock);
	msg = vduse_find_msg(&dev->recv_list, resp.request_id);
	if (!msg) {
		ret = -ENOENT;
		goto unlock;
	}

	memcpy(&msg->resp, &resp, sizeof(resp));
	msg->completed = 1;
	wake_up(&msg->waitq);
unlock:
	spin_unlock(&dev->msg_lock);

	return ret;
}

static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
{
	struct vduse_dev *dev = file->private_data;
	__poll_t mask = 0;

	poll_wait(file, &dev->waitq, wait);

	spin_lock(&dev->msg_lock);

	if (unlikely(dev->broken))
		mask |= EPOLLERR;
	if (!list_empty(&dev->send_list))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (!list_empty(&dev->recv_list))
		mask |= EPOLLOUT | EPOLLWRNORM;

	spin_unlock(&dev->msg_lock);

	return mask;
}

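/*
 * Return the device to its initial state: reset the bounce map, clear
 * per-device and per-virtqueue driver state, drop kick eventfds, and
 * flush any pending interrupt-injection work.
 */
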
static void vduse_dev_reset(struct vduse_dev *dev)
{
	int i;
	struct vduse_iova_domain *domain = dev->domain;

	/* The coherent mappings are handled in vduse_dev_free_coherent() */
	if (domain->bounce_map)
		vduse_domain_reset_bounce_map(domain);

	down_write(&dev->rwsem);

	dev->status = 0;
	dev->driver_features = 0;
	dev->generation++;
	spin_lock(&dev->irq_lock);
	dev->config_cb.callback = NULL;
	dev->config_cb.private = NULL;
	spin_unlock(&dev->irq_lock);
	flush_work(&dev->inject);
	for (i = 0; i < dev->vq_num; i++) {
		struct vduse_virtqueue *vq = &dev->vqs[i];

		vq->ready = false;
		vq->desc_addr = 0;
		vq->driver_addr = 0;
		vq->device_addr = 0;
		vq->num = 0;
		memset(&vq->state, 0, sizeof(vq->state));

		spin_lock(&vq->kick_lock);
		vq->kicked = false;
		if (vq->kickfd)
			eventfd_ctx_put(vq->kickfd);
		vq->kickfd = NULL;
		spin_unlock(&vq->kick_lock);

		spin_lock(&vq->irq_lock);
		vq->cb.callback = NULL;
		vq->cb.private = NULL;
		spin_unlock(&vq->irq_lock);
		flush_work(&vq->inject);
		flush_work(&vq->kick);
	}

	up_write(&dev->rwsem);
}

static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx,
				     u64 desc_area, u64 driver_area,
				     u64 device_area)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = &dev->vqs[idx];

	vq->desc_addr = desc_area;
	vq->driver_addr = driver_area;
	vq->device_addr = device_area;

	return 0;
}

static void vduse_vq_kick(struct vduse_virtqueue *vq)
{
	spin_lock(&vq->kick_lock);
	if (!vq->ready)
		goto unlock;

	if (vq->kickfd)
		eventfd_signal(vq->kickfd, 1);
	else
		vq->kicked = true;
unlock:
	spin_unlock(&vq->kick_lock);
}

static void vduse_vq_kick_work(struct work_struct *work)
{
	struct vduse_virtqueue *vq = container_of(work,
					struct vduse_virtqueue, kick);

	vduse_vq_kick(vq);
}

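/*
 * Signalling an eventfd may not be allowed from the calling context
 * (see eventfd_signal_allowed()); in that case defer the kick to the
 * per-vq work item above.
 */
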
static void vduse_vdpa_kick_vq(struct vdpa_device *vdpa, u16 idx)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = &dev->vqs[idx];

	if (!eventfd_signal_allowed()) {
		schedule_work(&vq->kick);
		return;
	}
	vduse_vq_kick(vq);
}

static void vduse_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
				 struct vdpa_callback *cb)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = &dev->vqs[idx];

	spin_lock(&vq->irq_lock);
	vq->cb.callback = cb->callback;
	vq->cb.private = cb->private;
	spin_unlock(&vq->irq_lock);
}

static void vduse_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = &dev->vqs[idx];

	vq->num = num;
}

static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa,
				    u16 idx, bool ready)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = &dev->vqs[idx];

	vq->ready = ready;
}

static bool vduse_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 idx)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = &dev->vqs[idx];

	return vq->ready;
}

static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx,
				   const struct vdpa_vq_state *state)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = &dev->vqs[idx];

	if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
		vq->state.packed.last_avail_counter =
				state->packed.last_avail_counter;
		vq->state.packed.last_avail_idx = state->packed.last_avail_idx;
		vq->state.packed.last_used_counter =
				state->packed.last_used_counter;
		vq->state.packed.last_used_idx = state->packed.last_used_idx;
	} else
		vq->state.split.avail_index = state->split.avail_index;

	return 0;
}

static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
				   struct vdpa_vq_state *state)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	struct vduse_virtqueue *vq = &dev->vqs[idx];

	if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED))
		return vduse_dev_get_vq_state_packed(dev, vq, &state->packed);

	return vduse_dev_get_vq_state_split(dev, vq, &state->split);
}

static u32 vduse_vdpa_get_vq_align(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	return dev->vq_align;
}

static u64 vduse_vdpa_get_features(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	return dev->device_features;
}

static int vduse_vdpa_set_features(struct vdpa_device *vdpa, u64 features)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	dev->driver_features = features;

	return 0;
}

static void vduse_vdpa_set_config_cb(struct vdpa_device *vdpa,
				     struct vdpa_callback *cb)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	spin_lock(&dev->irq_lock);
	dev->config_cb.callback = cb->callback;
	dev->config_cb.private = cb->private;
	spin_unlock(&dev->irq_lock);
}

static u16 vduse_vdpa_get_vq_num_max(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	u16 num_max = 0;
	int i;

	for (i = 0; i < dev->vq_num; i++)
		if (num_max < dev->vqs[i].num_max)
			num_max = dev->vqs[i].num_max;

	return num_max;
}

static u32 vduse_vdpa_get_device_id(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	return dev->device_id;
}

static u32 vduse_vdpa_get_vendor_id(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	return dev->vendor_id;
}

static u8 vduse_vdpa_get_status(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	return dev->status;
}

static void vduse_vdpa_set_status(struct vdpa_device *vdpa, u8 status)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	if (vduse_dev_set_status(dev, status))
		return;

	dev->status = status;
}

static size_t vduse_vdpa_get_config_size(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	return dev->config_size;
}

static void vduse_vdpa_get_config(struct vdpa_device *vdpa, unsigned int offset,
				  void *buf, unsigned int len)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	if (offset > dev->config_size ||
	    len > dev->config_size - offset)
		return;

	memcpy(buf, dev->config + offset, len);
}

static void vduse_vdpa_set_config(struct vdpa_device *vdpa, unsigned int offset,
				  const void *buf, unsigned int len)
{
	/* Now we only support read-only configuration space */
}

static int vduse_vdpa_reset(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	int ret = vduse_dev_set_status(dev, 0);

	vduse_dev_reset(dev);

	return ret;
}

static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	return dev->generation;
}

static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
			      struct vhost_iotlb *iotlb)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
	int ret;

	ret = vduse_domain_set_map(dev->domain, iotlb);
	if (ret)
		return ret;

	ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
	if (ret) {
		vduse_domain_clear_map(dev->domain, iotlb);
		return ret;
	}

	return 0;
}

static void vduse_vdpa_free(struct vdpa_device *vdpa)
{
	struct vduse_dev *dev = vdpa_to_vduse(vdpa);

	dev->vdev = NULL;
}

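/*
 * vDPA config ops. Most of these operate purely on kernel-side state;
 * only set_status, reset, get_vq_state and set_map need a synchronous
 * round-trip to the userspace daemon via vduse_dev_msg_sync().
 */
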
static const struct vdpa_config_ops vduse_vdpa_config_ops = {
	.set_vq_address		= vduse_vdpa_set_vq_address,
	.kick_vq		= vduse_vdpa_kick_vq,
	.set_vq_cb		= vduse_vdpa_set_vq_cb,
	.set_vq_num		= vduse_vdpa_set_vq_num,
	.set_vq_ready		= vduse_vdpa_set_vq_ready,
	.get_vq_ready		= vduse_vdpa_get_vq_ready,
	.set_vq_state		= vduse_vdpa_set_vq_state,
	.get_vq_state		= vduse_vdpa_get_vq_state,
	.get_vq_align		= vduse_vdpa_get_vq_align,
	.get_features		= vduse_vdpa_get_features,
	.set_features		= vduse_vdpa_set_features,
	.set_config_cb		= vduse_vdpa_set_config_cb,
	.get_vq_num_max		= vduse_vdpa_get_vq_num_max,
	.get_device_id		= vduse_vdpa_get_device_id,
	.get_vendor_id		= vduse_vdpa_get_vendor_id,
	.get_status		= vduse_vdpa_get_status,
	.set_status		= vduse_vdpa_set_status,
	.get_config_size	= vduse_vdpa_get_config_size,
	.get_config		= vduse_vdpa_get_config,
	.set_config		= vduse_vdpa_set_config,
	.get_generation		= vduse_vdpa_get_generation,
	.reset			= vduse_vdpa_reset,
	.set_map		= vduse_vdpa_set_map,
	.free			= vduse_vdpa_free,
};

static dma_addr_t vduse_dev_map_page(struct device *dev, struct page *page,
				     unsigned long offset, size_t size,
				     enum dma_data_direction dir,
				     unsigned long attrs)
{
	struct vduse_dev *vdev = dev_to_vduse(dev);
	struct vduse_iova_domain *domain = vdev->domain;

	return vduse_domain_map_page(domain, page, offset, size, dir, attrs);
}

static void vduse_dev_unmap_page(struct device *dev, dma_addr_t dma_addr,
				 size_t size, enum dma_data_direction dir,
				 unsigned long attrs)
{
	struct vduse_dev *vdev = dev_to_vduse(dev);
	struct vduse_iova_domain *domain = vdev->domain;

	return vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs);
}

static void *vduse_dev_alloc_coherent(struct device *dev, size_t size,
				      dma_addr_t *dma_addr, gfp_t flag,
				      unsigned long attrs)
{
	struct vduse_dev *vdev = dev_to_vduse(dev);
	struct vduse_iova_domain *domain = vdev->domain;
	unsigned long iova;
	void *addr;

	*dma_addr = DMA_MAPPING_ERROR;
	addr = vduse_domain_alloc_coherent(domain, size,
					   (dma_addr_t *)&iova, flag, attrs);
	if (!addr)
		return NULL;

	*dma_addr = (dma_addr_t)iova;

	return addr;
}

static void vduse_dev_free_coherent(struct device *dev, size_t size,
				    void *vaddr, dma_addr_t dma_addr,
				    unsigned long attrs)
{
	struct vduse_dev *vdev = dev_to_vduse(dev);
	struct vduse_iova_domain *domain = vdev->domain;

	vduse_domain_free_coherent(domain, size, vaddr, dma_addr, attrs);
}

static size_t vduse_dev_max_mapping_size(struct device *dev)
{
	struct vduse_dev *vdev = dev_to_vduse(dev);
	struct vduse_iova_domain *domain = vdev->domain;

	return domain->bounce_size;
}

static const struct dma_map_ops vduse_dev_dma_ops = {
	.map_page		= vduse_dev_map_page,
	.unmap_page		= vduse_dev_unmap_page,
	.alloc			= vduse_dev_alloc_coherent,
	.free			= vduse_dev_free_coherent,
	.max_mapping_size	= vduse_dev_max_mapping_size,
};

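/*
 * DMA from the emulated device is redirected into the per-device VDUSE
 * IOVA domain: streaming mappings go through the bounce buffer, and
 * coherent allocations are backed by domain memory that the daemon can
 * reach via VDUSE_IOTLB_GET_FD.
 */
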
static unsigned int perm_to_file_flags(u8 perm)
{
	unsigned int flags = 0;

	switch (perm) {
	case VDUSE_ACCESS_WO:
		flags |= O_WRONLY;
		break;
	case VDUSE_ACCESS_RO:
		flags |= O_RDONLY;
		break;
	case VDUSE_ACCESS_RW:
		flags |= O_RDWR;
		break;
	default:
		WARN(1, "invalid vhost IOTLB permission\n");
		break;
	}

	return flags;
}

static int vduse_kickfd_setup(struct vduse_dev *dev,
			      struct vduse_vq_eventfd *eventfd)
{
	struct eventfd_ctx *ctx = NULL;
	struct vduse_virtqueue *vq;
	u32 index;

	if (eventfd->index >= dev->vq_num)
		return -EINVAL;

	index = array_index_nospec(eventfd->index, dev->vq_num);
	vq = &dev->vqs[index];
	if (eventfd->fd >= 0) {
		ctx = eventfd_ctx_fdget(eventfd->fd);
		if (IS_ERR(ctx))
			return PTR_ERR(ctx);
	} else if (eventfd->fd != VDUSE_EVENTFD_DEASSIGN)
		return 0;

	spin_lock(&vq->kick_lock);
	if (vq->kickfd)
		eventfd_ctx_put(vq->kickfd);
	vq->kickfd = ctx;
	if (vq->ready && vq->kicked && vq->kickfd) {
		/* Replay a kick that arrived while no kickfd was assigned */
		eventfd_signal(vq->kickfd, 1);
		vq->kicked = false;
	}
	spin_unlock(&vq->kick_lock);

	return 0;
}

static bool vduse_dev_is_ready(struct vduse_dev *dev)
{
	int i;

	for (i = 0; i < dev->vq_num; i++)
		if (!dev->vqs[i].num_max)
			return false;

	return true;
}

static void vduse_dev_irq_inject(struct work_struct *work)
{
	struct vduse_dev *dev = container_of(work, struct vduse_dev, inject);

	spin_lock_irq(&dev->irq_lock);
	if (dev->config_cb.callback)
		dev->config_cb.callback(dev->config_cb.private);
	spin_unlock_irq(&dev->irq_lock);
}

static void vduse_vq_irq_inject(struct work_struct *work)
{
	struct vduse_virtqueue *vq = container_of(work,
					struct vduse_virtqueue, inject);

	spin_lock_irq(&vq->irq_lock);
	if (vq->ready && vq->cb.callback)
		vq->cb.callback(vq->cb.private);
	spin_unlock_irq(&vq->irq_lock);
}

static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
				    struct work_struct *irq_work)
{
	int ret = -EINVAL;

	down_read(&dev->rwsem);
	if (!(dev->status & VIRTIO_CONFIG_S_DRIVER_OK))
		goto unlock;

	ret = 0;
	queue_work(vduse_irq_wq, irq_work);
unlock:
	up_read(&dev->rwsem);

	return ret;
}

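/*
 * ioctls on /dev/vduse/$DEVICE, issued by the userspace daemon to wire up
 * IOTLB fds, configuration space, kick eventfds and interrupt injection.
 */
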
static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
			    unsigned long arg)
{
	struct vduse_dev *dev = file->private_data;
	void __user *argp = (void __user *)arg;
	int ret;

	if (unlikely(dev->broken))
		return -EPERM;

	switch (cmd) {
	case VDUSE_IOTLB_GET_FD: {
		struct vduse_iotlb_entry entry;
		struct vhost_iotlb_map *map;
		struct vdpa_map_file *map_file;
		struct vduse_iova_domain *domain = dev->domain;
		struct file *f = NULL;

		ret = -EFAULT;
		if (copy_from_user(&entry, argp, sizeof(entry)))
			break;

		ret = -EINVAL;
		if (entry.start > entry.last)
			break;

		spin_lock(&domain->iotlb_lock);
		map = vhost_iotlb_itree_first(domain->iotlb,
					      entry.start, entry.last);
		if (map) {
			map_file = (struct vdpa_map_file *)map->opaque;
			f = get_file(map_file->file);
			entry.offset = map_file->offset;
			entry.start = map->start;
			entry.last = map->last;
			entry.perm = map->perm;
		}
		spin_unlock(&domain->iotlb_lock);
		ret = -EINVAL;
		if (!f)
			break;

		ret = -EFAULT;
		if (copy_to_user(argp, &entry, sizeof(entry))) {
			fput(f);
			break;
		}
		ret = receive_fd(f, perm_to_file_flags(entry.perm));
		fput(f);
		break;
	}
	case VDUSE_DEV_GET_FEATURES:
		/*
		 * Just mirror what driver wrote here.
		 * The driver is expected to check FEATURE_OK later.
		 */
		ret = put_user(dev->driver_features, (u64 __user *)argp);
		break;
	case VDUSE_DEV_SET_CONFIG: {
		struct vduse_config_data config;
		unsigned long size = offsetof(struct vduse_config_data,
					      buffer);

		ret = -EFAULT;
		if (copy_from_user(&config, argp, size))
			break;

		ret = -EINVAL;
		if (config.offset > dev->config_size ||
		    config.length == 0 ||
		    config.length > dev->config_size - config.offset)
			break;

		ret = -EFAULT;
		if (copy_from_user(dev->config + config.offset, argp + size,
				   config.length))
			break;

		ret = 0;
		break;
	}
	case VDUSE_DEV_INJECT_CONFIG_IRQ:
		ret = vduse_dev_queue_irq_work(dev, &dev->inject);
		break;
	case VDUSE_VQ_SETUP: {
		struct vduse_vq_config config;
		u32 index;

		ret = -EFAULT;
		if (copy_from_user(&config, argp, sizeof(config)))
			break;

		ret = -EINVAL;
		if (config.index >= dev->vq_num)
			break;

		if (!is_mem_zero((const char *)config.reserved,
				 sizeof(config.reserved)))
			break;

		index = array_index_nospec(config.index, dev->vq_num);
		dev->vqs[index].num_max = config.max_size;
		ret = 0;
		break;
	}
	case VDUSE_VQ_GET_INFO: {
		struct vduse_vq_info vq_info;
		struct vduse_virtqueue *vq;
		u32 index;

		ret = -EFAULT;
		if (copy_from_user(&vq_info, argp, sizeof(vq_info)))
			break;

		ret = -EINVAL;
		if (vq_info.index >= dev->vq_num)
			break;

		index = array_index_nospec(vq_info.index, dev->vq_num);
		vq = &dev->vqs[index];
		vq_info.desc_addr = vq->desc_addr;
		vq_info.driver_addr = vq->driver_addr;
		vq_info.device_addr = vq->device_addr;
		vq_info.num = vq->num;

		if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
			vq_info.packed.last_avail_counter =
				vq->state.packed.last_avail_counter;
			vq_info.packed.last_avail_idx =
				vq->state.packed.last_avail_idx;
			vq_info.packed.last_used_counter =
				vq->state.packed.last_used_counter;
			vq_info.packed.last_used_idx =
				vq->state.packed.last_used_idx;
		} else
			vq_info.split.avail_index =
				vq->state.split.avail_index;

		vq_info.ready = vq->ready;

		ret = -EFAULT;
		if (copy_to_user(argp, &vq_info, sizeof(vq_info)))
			break;

		ret = 0;
		break;
	}
	case VDUSE_VQ_SETUP_KICKFD: {
		struct vduse_vq_eventfd eventfd;

		ret = -EFAULT;
		if (copy_from_user(&eventfd, argp, sizeof(eventfd)))
			break;

		ret = vduse_kickfd_setup(dev, &eventfd);
		break;
	}
	case VDUSE_VQ_INJECT_IRQ: {
		u32 index;

		ret = -EFAULT;
		if (get_user(index, (u32 __user *)argp))
			break;

		ret = -EINVAL;
		if (index >= dev->vq_num)
			break;

		index = array_index_nospec(index, dev->vq_num);
		ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index].inject);
		break;
	}
	default:
		ret = -ENOIOCTLCMD;
		break;
	}

	return ret;
}

static int vduse_dev_release(struct inode *inode, struct file *file)
{
	struct vduse_dev *dev = file->private_data;

	spin_lock(&dev->msg_lock);
	/* Make sure the inflight messages can be processed after reconnection */
	list_splice_init(&dev->recv_list, &dev->send_list);
	spin_unlock(&dev->msg_lock);
	dev->connected = false;

	return 0;
}

static struct vduse_dev *vduse_dev_get_from_minor(int minor)
{
	struct vduse_dev *dev;

	mutex_lock(&vduse_lock);
	dev = idr_find(&vduse_idr, minor);
	mutex_unlock(&vduse_lock);

	return dev;
}

static int vduse_dev_open(struct inode *inode, struct file *file)
{
	int ret;
	struct vduse_dev *dev = vduse_dev_get_from_minor(iminor(inode));

	if (!dev)
		return -ENODEV;

	ret = -EBUSY;
	mutex_lock(&dev->lock);
	if (dev->connected)
		goto unlock;

	ret = 0;
	dev->connected = true;
	file->private_data = dev;
unlock:
	mutex_unlock(&dev->lock);

	return ret;
}

static const struct file_operations vduse_dev_fops = {
	.owner		= THIS_MODULE,
	.open		= vduse_dev_open,
	.release	= vduse_dev_release,
	.read_iter	= vduse_dev_read_iter,
	.write_iter	= vduse_dev_write_iter,
	.poll		= vduse_dev_poll,
	.unlocked_ioctl	= vduse_dev_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.llseek		= noop_llseek,
};

static struct vduse_dev *vduse_dev_create(void)
{
	struct vduse_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);

	if (!dev)
		return NULL;

	mutex_init(&dev->lock);
	spin_lock_init(&dev->msg_lock);
	INIT_LIST_HEAD(&dev->send_list);
	INIT_LIST_HEAD(&dev->recv_list);
	spin_lock_init(&dev->irq_lock);
	init_rwsem(&dev->rwsem);

	INIT_WORK(&dev->inject, vduse_dev_irq_inject);
	init_waitqueue_head(&dev->waitq);

	return dev;
}

static void vduse_dev_destroy(struct vduse_dev *dev)
{
	kfree(dev);
}

static struct vduse_dev *vduse_find_dev(const char *name)
{
	struct vduse_dev *dev;
	int id;

	idr_for_each_entry(&vduse_idr, dev, id)
		if (!strcmp(dev->name, name))
			return dev;

	return NULL;
}

static int vduse_destroy_dev(char *name)
{
	struct vduse_dev *dev = vduse_find_dev(name);

	if (!dev)
		return -EINVAL;

	mutex_lock(&dev->lock);
	if (dev->vdev || dev->connected) {
		mutex_unlock(&dev->lock);
		return -EBUSY;
	}
	/* Prevent a new connection while the device is being torn down */
	dev->connected = true;
	mutex_unlock(&dev->lock);

	vduse_dev_reset(dev);
	device_destroy(vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
	idr_remove(&vduse_idr, dev->minor);
	kvfree(dev->config);
	kfree(dev->vqs);
	vduse_domain_destroy(dev->domain);
	kfree(dev->name);
	vduse_dev_destroy(dev);
	module_put(THIS_MODULE);

	return 0;
}

static bool device_is_allowed(u32 device_id)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(allowed_device_id); i++)
		if (allowed_device_id[i] == device_id)
			return true;

	return false;
}

static bool features_is_valid(u64 features)
{
	if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM)))
		return false;

	/* Now we only support read-only configuration space */
	if (features & (1ULL << VIRTIO_BLK_F_CONFIG_WCE))
		return false;

	return true;
}

static bool vduse_validate_config(struct vduse_dev_config *config)
{
	if (!is_mem_zero((const char *)config->reserved,
			 sizeof(config->reserved)))
		return false;

	if (config->vq_align > PAGE_SIZE)
		return false;

	if (config->config_size > PAGE_SIZE)
		return false;

	if (!device_is_allowed(config->device_id))
		return false;

	if (!features_is_valid(config->features))
		return false;

	return true;
}

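/*
 * Per-device sysfs attribute controlling how long (in seconds) the kernel
 * waits for the daemon to answer a request; 0 means wait forever. With the
 * class layout below, the knob should surface as
 * /sys/class/vduse/$DEVICE/msg_timeout (path given as an assumption).
 */
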
static ssize_t msg_timeout_show(struct device *device,
				struct device_attribute *attr, char *buf)
{
	struct vduse_dev *dev = dev_get_drvdata(device);

	return sysfs_emit(buf, "%u\n", dev->msg_timeout);
}

static ssize_t msg_timeout_store(struct device *device,
				 struct device_attribute *attr,
				 const char *buf, size_t count)
{
	struct vduse_dev *dev = dev_get_drvdata(device);
	int ret;

	ret = kstrtouint(buf, 10, &dev->msg_timeout);
	if (ret < 0)
		return ret;

	return count;
}

static DEVICE_ATTR_RW(msg_timeout);

static struct attribute *vduse_dev_attrs[] = {
	&dev_attr_msg_timeout.attr,
	NULL
};

ATTRIBUTE_GROUPS(vduse_dev);

static int vduse_create_dev(struct vduse_dev_config *config,
			    void *config_buf, u64 api_version)
{
	int i, ret;
	struct vduse_dev *dev;

	ret = -EEXIST;
	if (vduse_find_dev(config->name))
		goto err;

	ret = -ENOMEM;
	dev = vduse_dev_create();
	if (!dev)
		goto err;

	dev->api_version = api_version;
	dev->device_features = config->features;
	dev->device_id = config->device_id;
	dev->vendor_id = config->vendor_id;
	dev->name = kstrdup(config->name, GFP_KERNEL);
	if (!dev->name)
		goto err_str;

	dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1,
					  VDUSE_BOUNCE_SIZE);
	if (!dev->domain)
		goto err_domain;

	dev->config = config_buf;
	dev->config_size = config->config_size;
	dev->vq_align = config->vq_align;
	dev->vq_num = config->vq_num;
	dev->vqs = kcalloc(dev->vq_num, sizeof(*dev->vqs), GFP_KERNEL);
	if (!dev->vqs)
		goto err_vqs;

	for (i = 0; i < dev->vq_num; i++) {
		dev->vqs[i].index = i;
		INIT_WORK(&dev->vqs[i].inject, vduse_vq_irq_inject);
		INIT_WORK(&dev->vqs[i].kick, vduse_vq_kick_work);
		spin_lock_init(&dev->vqs[i].kick_lock);
		spin_lock_init(&dev->vqs[i].irq_lock);
	}

	ret = idr_alloc(&vduse_idr, dev, 1, VDUSE_DEV_MAX, GFP_KERNEL);
	if (ret < 0)
		goto err_idr;

	dev->minor = ret;
	dev->msg_timeout = VDUSE_MSG_DEFAULT_TIMEOUT;
	dev->dev = device_create(vduse_class, NULL,
				 MKDEV(MAJOR(vduse_major), dev->minor),
				 dev, "%s", config->name);
	if (IS_ERR(dev->dev)) {
		ret = PTR_ERR(dev->dev);
		goto err_dev;
	}
	__module_get(THIS_MODULE);

	return 0;
err_dev:
	idr_remove(&vduse_idr, dev->minor);
err_idr:
	kfree(dev->vqs);
err_vqs:
	vduse_domain_destroy(dev->domain);
err_domain:
	kfree(dev->name);
err_str:
	vduse_dev_destroy(dev);
err:
	return ret;
}

static long vduse_ioctl(struct file *file, unsigned int cmd,
			unsigned long arg)
{
	int ret;
	void __user *argp = (void __user *)arg;
	struct vduse_control *control = file->private_data;

	mutex_lock(&vduse_lock);
	switch (cmd) {
	case VDUSE_GET_API_VERSION:
		ret = put_user(control->api_version, (u64 __user *)argp);
		break;
	case VDUSE_SET_API_VERSION: {
		u64 api_version;

		ret = -EFAULT;
		if (get_user(api_version, (u64 __user *)argp))
			break;

		ret = -EINVAL;
		if (api_version > VDUSE_API_VERSION)
			break;

		ret = 0;
		control->api_version = api_version;
		break;
	}
	case VDUSE_CREATE_DEV: {
		struct vduse_dev_config config;
		unsigned long size = offsetof(struct vduse_dev_config, config);
		void *buf;

		ret = -EFAULT;
		if (copy_from_user(&config, argp, size))
			break;

		ret = -EINVAL;
		if (!vduse_validate_config(&config))
			break;

		buf = vmemdup_user(argp + size, config.config_size);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			break;
		}
		config.name[VDUSE_NAME_MAX - 1] = '\0';
		ret = vduse_create_dev(&config, buf, control->api_version);
		if (ret)
			kvfree(buf);
		break;
	}
	case VDUSE_DESTROY_DEV: {
		char name[VDUSE_NAME_MAX];

		ret = -EFAULT;
		if (copy_from_user(name, argp, VDUSE_NAME_MAX))
			break;

		name[VDUSE_NAME_MAX - 1] = '\0';
		ret = vduse_destroy_dev(name);
		break;
	}
	default:
		ret = -EINVAL;
		break;
	}
	mutex_unlock(&vduse_lock);

	return ret;
}

static int vduse_release(struct inode *inode, struct file *file)
{
	struct vduse_control *control = file->private_data;

	kfree(control);

	return 0;
}

static int vduse_open(struct inode *inode, struct file *file)
{
	struct vduse_control *control;

	control = kmalloc(sizeof(struct vduse_control), GFP_KERNEL);
	if (!control)
		return -ENOMEM;

	control->api_version = VDUSE_API_VERSION;
	file->private_data = control;

	return 0;
}

static const struct file_operations vduse_ctrl_fops = {
	.owner		= THIS_MODULE,
	.open		= vduse_open,
	.release	= vduse_release,
	.unlocked_ioctl	= vduse_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.llseek		= noop_llseek,
};

static char *vduse_devnode(struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vduse/%s", dev_name(dev));
}

static void vduse_mgmtdev_release(struct device *dev)
{
}

static struct device vduse_mgmtdev = {
	.init_name = "vduse",
	.release = vduse_mgmtdev_release,
};

static struct vdpa_mgmt_dev mgmt_dev;

static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name)
{
	struct vduse_vdpa *vdev;
	int ret;

	if (dev->vdev)
		return -EEXIST;

	vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev,
				 &vduse_vdpa_config_ops, name, true);
	if (IS_ERR(vdev))
		return PTR_ERR(vdev);

	dev->vdev = vdev;
	vdev->dev = dev;
	vdev->vdpa.dev.dma_mask = &vdev->vdpa.dev.coherent_dma_mask;
	ret = dma_set_mask_and_coherent(&vdev->vdpa.dev, DMA_BIT_MASK(64));
	if (ret) {
		put_device(&vdev->vdpa.dev);
		return ret;
	}
	set_dma_ops(&vdev->vdpa.dev, &vduse_dev_dma_ops);
	vdev->vdpa.dma_dev = &vdev->vdpa.dev;
	vdev->vdpa.mdev = &mgmt_dev;

	return 0;
}

static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
			const struct vdpa_dev_set_config *config)
{
	struct vduse_dev *dev;
	int ret;

	mutex_lock(&vduse_lock);
	dev = vduse_find_dev(name);
	if (!dev || !vduse_dev_is_ready(dev)) {
		mutex_unlock(&vduse_lock);
		return -EINVAL;
	}
	ret = vduse_dev_init_vdpa(dev, name);
	mutex_unlock(&vduse_lock);
	if (ret)
		return ret;

	ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num);
	if (ret) {
		put_device(&dev->vdev->vdpa.dev);
		return ret;
	}

	return 0;
}

static void vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev)
{
	_vdpa_unregister_device(dev);
}

static const struct vdpa_mgmtdev_ops vdpa_dev_mgmtdev_ops = {
	.dev_add = vdpa_dev_add,
	.dev_del = vdpa_dev_del,
};

static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

static struct vdpa_mgmt_dev mgmt_dev = {
	.device = &vduse_mgmtdev,
	.id_table = id_table,
	.ops = &vdpa_dev_mgmtdev_ops,
};

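/*
 * Once the daemon has configured all virtqueues, the device can be bound
 * to a vDPA bus driver through this management interface, e.g. with the
 * iproute2 vdpa tool (invocation shown as a plausible example; the name
 * must match the one passed to VDUSE_CREATE_DEV):
 *
 *	vdpa dev add name foo mgmtdev vduse
 */
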
static int vduse_mgmtdev_init(void)
{
	int ret;

	ret = device_register(&vduse_mgmtdev);
	if (ret)
		return ret;

	ret = vdpa_mgmtdev_register(&mgmt_dev);
	if (ret)
		goto err;

	return 0;
err:
	device_unregister(&vduse_mgmtdev);
	return ret;
}

static void vduse_mgmtdev_exit(void)
{
	vdpa_mgmtdev_unregister(&mgmt_dev);
	device_unregister(&vduse_mgmtdev);
}

static int vduse_init(void)
{
	int ret;
	struct device *dev;

	vduse_class = class_create(THIS_MODULE, "vduse");
	if (IS_ERR(vduse_class))
		return PTR_ERR(vduse_class);

	vduse_class->devnode = vduse_devnode;
	vduse_class->dev_groups = vduse_dev_groups;

	ret = alloc_chrdev_region(&vduse_major, 0, VDUSE_DEV_MAX, "vduse");
	if (ret)
		goto err_chardev_region;

	/* /dev/vduse/control */
	cdev_init(&vduse_ctrl_cdev, &vduse_ctrl_fops);
	vduse_ctrl_cdev.owner = THIS_MODULE;
	ret = cdev_add(&vduse_ctrl_cdev, vduse_major, 1);
	if (ret)
		goto err_ctrl_cdev;

	dev = device_create(vduse_class, NULL, vduse_major, NULL, "control");
	if (IS_ERR(dev)) {
		ret = PTR_ERR(dev);
		goto err_device;
	}

	/* /dev/vduse/$DEVICE */
	cdev_init(&vduse_cdev, &vduse_dev_fops);
	vduse_cdev.owner = THIS_MODULE;
	ret = cdev_add(&vduse_cdev, MKDEV(MAJOR(vduse_major), 1),
		       VDUSE_DEV_MAX - 1);
	if (ret)
		goto err_cdev;

	vduse_irq_wq = alloc_workqueue("vduse-irq",
				       WQ_HIGHPRI | WQ_SYSFS | WQ_UNBOUND, 0);
	if (!vduse_irq_wq) {
		ret = -ENOMEM;
		goto err_wq;
	}

	ret = vduse_domain_init();
	if (ret)
		goto err_domain;

	ret = vduse_mgmtdev_init();
	if (ret)
		goto err_mgmtdev;

	return 0;
err_mgmtdev:
	vduse_domain_exit();
err_domain:
	destroy_workqueue(vduse_irq_wq);
err_wq:
	cdev_del(&vduse_cdev);
err_cdev:
	device_destroy(vduse_class, vduse_major);
err_device:
	cdev_del(&vduse_ctrl_cdev);
err_ctrl_cdev:
	unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
err_chardev_region:
	class_destroy(vduse_class);
	return ret;
}
module_init(vduse_init);

static void vduse_exit(void)
{
	vduse_mgmtdev_exit();
	vduse_domain_exit();
	destroy_workqueue(vduse_irq_wq);
	cdev_del(&vduse_cdev);
	device_destroy(vduse_class, vduse_major);
	cdev_del(&vduse_ctrl_cdev);
	unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
	class_destroy(vduse_class);
}
module_exit(vduse_exit);

MODULE_LICENSE(DRV_LICENSE);
MODULE_AUTHOR(DRV_AUTHOR);
MODULE_DESCRIPTION(DRV_DESC);