1 // SPDX-License-Identifier: GPL-2.0-only
3 * VDUSE: vDPA Device in Userspace
5 * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved.
7 * Author: Xie Yongji <xieyongji@bytedance.com>
11 #include <linux/init.h>
12 #include <linux/module.h>
13 #include <linux/cdev.h>
14 #include <linux/device.h>
15 #include <linux/eventfd.h>
16 #include <linux/slab.h>
17 #include <linux/wait.h>
18 #include <linux/dma-map-ops.h>
19 #include <linux/poll.h>
20 #include <linux/file.h>
21 #include <linux/uio.h>
22 #include <linux/vdpa.h>
23 #include <linux/nospec.h>
24 #include <uapi/linux/vduse.h>
25 #include <uapi/linux/vdpa.h>
26 #include <uapi/linux/virtio_config.h>
27 #include <uapi/linux/virtio_ids.h>
28 #include <uapi/linux/virtio_blk.h>
29 #include <linux/mod_devicetable.h>
31 #include "iova_domain.h"
/* Module metadata strings; consumed by the MODULE_* macros at the end of the file. */
33 #define DRV_AUTHOR "Yongji Xie <xieyongji@bytedance.com>"
34 #define DRV_DESC "vDPA Device in Userspace"
35 #define DRV_LICENSE "GPL v2"
/* Size of the char-device region reserved in vduse_init() and upper bound for IDR allocation. */
37 #define VDUSE_DEV_MAX (1U << MINORBITS)
/* 64 MiB. */
38 #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
/* 128 MiB; vduse_create_dev() passes VDUSE_IOVA_SIZE - 1 to vduse_domain_create(). */
39 #define VDUSE_IOVA_SIZE (128 * 1024 * 1024)
/* Default dev->msg_timeout; vduse_dev_msg_sync() multiplies it by HZ, so the unit is seconds. */
40 #define VDUSE_MSG_DEFAULT_TIMEOUT 30
/*
 * Per-virtqueue state. Only some members are visible in this listing; the
 * rest of the file also uses index, num, num_max, ready, kicked, desc_addr,
 * driver_addr, device_addr, kick_lock and irq_lock, which are presumably
 * declared on lines not shown here.
 */
42 struct vduse_virtqueue {
/* Ring position; zeroed by vduse_dev_reset(), reported by VDUSE_VQ_GET_INFO. */
49 struct vdpa_vq_state state;
/* Eventfd signalled to kick userspace; managed in vduse_kickfd_setup(). */
54 struct eventfd_ctx *kickfd;
/* Driver callback invoked from vduse_vq_irq_inject(). */
55 struct vdpa_callback cb;
/* Work item bound to vduse_vq_irq_inject() in vduse_create_dev(). */
56 struct work_struct inject;
/* Work item bound to vduse_vq_kick_work() in vduse_create_dev(). */
57 struct work_struct kick;
/*
 * NOTE(review): presumably the members of struct vduse_vdpa (the struct header
 * is not visible in this listing). vdpa_to_vduse() recovers this wrapper from
 * its embedded "vdpa" member via container_of().
 */
63 struct vdpa_device vdpa;
64 struct vduse_dev *dev;
/*
 * NOTE(review): presumably members of struct vduse_dev (header and several
 * members are not visible in this listing).
 */
/* Associated vDPA wrapper; vduse_destroy_dev() refuses to proceed while it is set. */
68 struct vduse_vdpa *vdev;
/* Array of vq_num virtqueues allocated in vduse_create_dev(). */
70 struct vduse_virtqueue *vqs;
71 struct vduse_iova_domain *domain;
/* Woken when a request is queued on send_list; used by read and poll. */
77 wait_queue_head_t waitq;
/* Requests waiting to be read by userspace. */
78 struct list_head send_list;
/* Requests already read and awaiting a response. */
79 struct list_head recv_list;
/* Config-change callback invoked from vduse_dev_irq_inject(). */
80 struct vdpa_callback config_cb;
/* Work item bound to vduse_dev_irq_inject(). */
81 struct work_struct inject;
/*
 * One request/response exchange with userspace. vduse_dev_msg_sync() also
 * relies on a "completed" flag that is not visible in this listing.
 */
99 struct vduse_dev_msg {
/* Copied to userspace by vduse_dev_read_iter(). */
100 struct vduse_dev_request req;
/* Filled from userspace by vduse_dev_write_iter(). */
101 struct vduse_dev_response resp;
/* Linkage on dev->send_list or dev->recv_list. */
102 struct list_head list;
/* The sender sleeps here in vduse_dev_msg_sync() until the message completes. */
103 wait_queue_head_t waitq;
/*
 * Per-open state of the control device: vduse_open() allocates one and stores
 * the negotiated api_version in it (members are not visible in this listing).
 */
107 struct vduse_control {
/* Held across control ioctls (vduse_ioctl()) and around IDR lookups. */
111 static DEFINE_MUTEX(vduse_lock);
/* Maps allocated IDs to vduse_dev instances; queried by minor and by name. */
112 static DEFINE_IDR(vduse_idr);
/* Base dev_t of the char-device region allocated in vduse_init(). */
114 static dev_t vduse_major;
/* Device class created as "vduse" in vduse_init(). */
115 static struct class *vduse_class;
/* cdev backing /dev/vduse/control. */
116 static struct cdev vduse_ctrl_cdev;
/* cdev backing the per-device /dev/vduse/$DEVICE nodes. */
117 static struct cdev vduse_cdev;
/* Workqueue on which interrupt-injection work is queued. */
118 static struct workqueue_struct *vduse_irq_wq;
/* Device IDs accepted by device_is_allowed(); the entries are not visible in this listing. */
120 static u32 allowed_device_id[] = {
/*
 * Map a vdpa_device to its owning vduse_dev. container_of() recovers the
 * enclosing struct vduse_vdpa; the return statement is not visible in this
 * listing (presumably it yields vdev->dev).
 */
124 static inline struct vduse_dev *vdpa_to_vduse(struct vdpa_device *vdpa)
126 struct vduse_vdpa *vdev = container_of(vdpa, struct vduse_vdpa, vdpa);
/*
 * Map a generic struct device to its vduse_dev by first converting it with
 * dev_to_vdpa() and then delegating to vdpa_to_vduse().
 */
131 static inline struct vduse_dev *dev_to_vduse(struct device *dev)
133 struct vdpa_device *vdpa = dev_to_vdpa(dev);
135 return vdpa_to_vduse(vdpa);
/*
 * Search @head for the message whose req.request_id equals request_id and
 * unlink it from the list. The second parameter line and the return paths are
 * not visible in this listing.
 */
138 static struct vduse_dev_msg *vduse_find_msg(struct list_head *head,
141 struct vduse_dev_msg *msg;
143 list_for_each_entry(msg, head, list) {
144 if (msg->req.request_id == request_id) {
/* Detach the match so it can no longer be found on this list. */
145 list_del(&msg->list);
/*
 * Remove the first message from @head, if any. msg starts out NULL and is only
 * assigned when the list is non-empty.
 */
153 static struct vduse_dev_msg *vduse_dequeue_msg(struct list_head *head)
155 struct vduse_dev_msg *msg = NULL;
157 if (!list_empty(head)) {
158 msg = list_first_entry(head, struct vduse_dev_msg, list);
159 list_del(&msg->list);
/* Append @msg to the tail of @head, keeping the list in FIFO order. */
165 static void vduse_enqueue_msg(struct list_head *head,
166 struct vduse_dev_msg *msg)
168 list_add_tail(&msg->list, head);
/*
 * Fail every outstanding message on @dev. Messages already read by userspace
 * (recv_list) are merged into send_list, then each one is unlinked, marked
 * VDUSE_REQ_RESULT_FAILED and its waiter is woken. Finally dev->waitq is woken.
 * The function checks dev->broken first; the branch body is not visible here
 * (presumably an early return). vduse_dev_msg_sync() calls this with
 * dev->msg_lock held.
 */
171 static void vduse_dev_broken(struct vduse_dev *dev)
173 struct vduse_dev_msg *msg, *tmp;
175 if (unlikely(dev->broken))
178 list_splice_init(&dev->recv_list, &dev->send_list);
179 list_for_each_entry_safe(msg, tmp, &dev->send_list, list) {
180 list_del(&msg->list);
182 msg->resp.result = VDUSE_REQ_RESULT_FAILED;
183 wake_up(&msg->waitq);
186 wake_up(&dev->waitq);
/*
 * Send @msg to userspace and wait for the reply.
 *
 * The message gets a fresh request_id, is queued on dev->send_list and readers
 * on dev->waitq are woken. The caller then sleeps (killable) until
 * msg->completed is set; when dev->msg_timeout is non-zero the sleep is
 * bounded to msg_timeout * HZ jiffies. If the message is still not completed
 * afterwards it is unlinked, marked failed and the device is declared broken.
 * ret ends up 0 when resp.result is VDUSE_REQ_RESULT_OK and -EIO otherwise.
 * The early-exit branches for a broken device are not visible in this listing.
 */
189 static int vduse_dev_msg_sync(struct vduse_dev *dev,
190 struct vduse_dev_msg *msg)
/* Unlocked fast-path check; repeated under msg_lock below. */
194 if (unlikely(dev->broken))
197 init_waitqueue_head(&msg->waitq);
198 spin_lock(&dev->msg_lock);
199 if (unlikely(dev->broken)) {
200 spin_unlock(&dev->msg_lock);
203 msg->req.request_id = dev->msg_unique++;
204 vduse_enqueue_msg(&dev->send_list, msg);
205 wake_up(&dev->waitq);
206 spin_unlock(&dev->msg_lock);
207 if (dev->msg_timeout)
208 ret = wait_event_killable_timeout(msg->waitq, msg->completed,
209 (long)dev->msg_timeout * HZ);
211 ret = wait_event_killable(msg->waitq, msg->completed);
213 spin_lock(&dev->msg_lock);
214 if (!msg->completed) {
/* Still linked on send_list or recv_list; remove it before the caller's storage goes away. */
215 list_del(&msg->list);
216 msg->resp.result = VDUSE_REQ_RESULT_FAILED;
217 /* Mark the device as malfunctioning when there is a timeout */
219 vduse_dev_broken(dev);
221 ret = (msg->resp.result == VDUSE_REQ_RESULT_OK) ? 0 : -EIO;
222 spin_unlock(&dev->msg_lock);
/*
 * Ask userspace for the packed-ring state of @vq through a VDUSE_GET_VQ_STATE
 * message and store the reply in @packed. The reply is sanitised: wrap
 * counters are masked to 1 bit and indices to 15 bits. Handling of a failed
 * vduse_dev_msg_sync() is not visible in this listing.
 */
227 static int vduse_dev_get_vq_state_packed(struct vduse_dev *dev,
228 struct vduse_virtqueue *vq,
229 struct vdpa_vq_state_packed *packed)
231 struct vduse_dev_msg msg = { 0 };
234 msg.req.type = VDUSE_GET_VQ_STATE;
235 msg.req.vq_state.index = vq->index;
237 ret = vduse_dev_msg_sync(dev, &msg);
241 packed->last_avail_counter =
242 msg.resp.vq_state.packed.last_avail_counter & 0x0001;
243 packed->last_avail_idx =
244 msg.resp.vq_state.packed.last_avail_idx & 0x7FFF;
245 packed->last_used_counter =
246 msg.resp.vq_state.packed.last_used_counter & 0x0001;
247 packed->last_used_idx =
248 msg.resp.vq_state.packed.last_used_idx & 0x7FFF;
/*
 * Ask userspace for the split-ring state of @vq through a VDUSE_GET_VQ_STATE
 * message and copy the returned avail_index into @split. Handling of a failed
 * vduse_dev_msg_sync() is not visible in this listing.
 */
253 static int vduse_dev_get_vq_state_split(struct vduse_dev *dev,
254 struct vduse_virtqueue *vq,
255 struct vdpa_vq_state_split *split)
257 struct vduse_dev_msg msg = { 0 };
260 msg.req.type = VDUSE_GET_VQ_STATE;
261 msg.req.vq_state.index = vq->index;
263 ret = vduse_dev_msg_sync(dev, &msg);
267 split->avail_index = msg.resp.vq_state.split.avail_index;
/*
 * Send a VDUSE_SET_STATUS message carrying @status to userspace and return
 * the result of vduse_dev_msg_sync().
 */
272 static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
274 struct vduse_dev_msg msg = { 0 };
276 msg.req.type = VDUSE_SET_STATUS;
277 msg.req.s.status = status;
279 return vduse_dev_msg_sync(dev, &msg);
/*
 * Send a VDUSE_UPDATE_IOTLB message for the IOVA range [start, last] to
 * userspace and return the result of vduse_dev_msg_sync(). The parameter line
 * declaring start/last and any validation before the message is built are not
 * visible in this listing.
 */
282 static int vduse_dev_update_iotlb(struct vduse_dev *dev,
285 struct vduse_dev_msg msg = { 0 };
290 msg.req.type = VDUSE_UPDATE_IOTLB;
291 msg.req.iova.start = start;
292 msg.req.iova.last = last;
294 return vduse_dev_msg_sync(dev, &msg);
/*
 * read() handler: hand the next pending request to userspace.
 *
 * The iterator must have room for a whole struct vduse_dev_request. A message
 * is dequeued from send_list under msg_lock; when none is pending the code
 * checks O_NONBLOCK and otherwise sleeps (interruptible, exclusive) until
 * send_list is non-empty. The request is copied out with the lock dropped,
 * then the message is either put back on send_list or moved to recv_list.
 * The condition selecting between those two and the return values are not
 * visible in this listing (presumably a short copy requeues the message).
 */
297 static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
299 struct file *file = iocb->ki_filp;
300 struct vduse_dev *dev = file->private_data;
301 struct vduse_dev_msg *msg;
302 int size = sizeof(struct vduse_dev_request);
305 if (iov_iter_count(to) < size)
308 spin_lock(&dev->msg_lock);
310 msg = vduse_dequeue_msg(&dev->send_list);
315 if (file->f_flags & O_NONBLOCK)
/* Drop the spinlock before sleeping. */
318 spin_unlock(&dev->msg_lock);
319 ret = wait_event_interruptible_exclusive(dev->waitq,
320 !list_empty(&dev->send_list));
324 spin_lock(&dev->msg_lock);
/* copy_to_iter() runs outside the spinlock. */
326 spin_unlock(&dev->msg_lock);
327 ret = copy_to_iter(&msg->req, size, to);
328 spin_lock(&dev->msg_lock);
331 vduse_enqueue_msg(&dev->send_list, msg);
334 vduse_enqueue_msg(&dev->recv_list, msg);
336 spin_unlock(&dev->msg_lock);
/*
 * Scan @size bytes starting at @ptr. The loop body and return statements are
 * not visible in this listing; callers use the result to reject structures
 * whose reserved fields are not all zero.
 */
341 static bool is_mem_zero(const char *ptr, int size)
345 for (i = 0; i < size; i++) {
/*
 * write() handler: accept a response from userspace.
 *
 * A full struct vduse_dev_response must be supplied and its reserved area is
 * checked with is_mem_zero(). Under msg_lock the matching request is looked up
 * (and unlinked) in recv_list by request_id, the response is copied into it
 * and the sleeping sender is woken. Error returns and the statement that marks
 * the message completed are not visible in this listing.
 */
352 static ssize_t vduse_dev_write_iter(struct kiocb *iocb, struct iov_iter *from)
354 struct file *file = iocb->ki_filp;
355 struct vduse_dev *dev = file->private_data;
356 struct vduse_dev_response resp;
357 struct vduse_dev_msg *msg;
360 ret = copy_from_iter(&resp, sizeof(resp), from);
361 if (ret != sizeof(resp))
364 if (!is_mem_zero((const char *)resp.reserved, sizeof(resp.reserved)))
367 spin_lock(&dev->msg_lock);
368 msg = vduse_find_msg(&dev->recv_list, resp.request_id);
374 memcpy(&msg->resp, &resp, sizeof(resp));
376 wake_up(&msg->waitq);
378 spin_unlock(&dev->msg_lock);
/*
 * poll() handler. Registers on dev->waitq, then under msg_lock reports the
 * file readable when send_list is non-empty and writable when recv_list is
 * non-empty. The mask bit set for a broken device is not visible in this
 * listing.
 */
383 static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
385 struct vduse_dev *dev = file->private_data;
388 poll_wait(file, &dev->waitq, wait);
390 spin_lock(&dev->msg_lock);
392 if (unlikely(dev->broken))
394 if (!list_empty(&dev->send_list))
395 mask |= EPOLLIN | EPOLLRDNORM;
396 if (!list_empty(&dev->recv_list))
397 mask |= EPOLLOUT | EPOLLWRNORM;
399 spin_unlock(&dev->msg_lock);
/*
 * Return @dev to its initial state.
 *
 * Resets the bounce map if one exists, clears the negotiated driver features,
 * removes the config callback under irq_lock and flushes the config-interrupt
 * work. Then, for every virtqueue: zeroes the saved ring state, drops the
 * kick eventfd under kick_lock, removes the interrupt callback under irq_lock
 * and flushes both per-queue work items. Further per-device and per-queue
 * field resets may exist on lines not visible in this listing.
 */
404 static void vduse_dev_reset(struct vduse_dev *dev)
407 struct vduse_iova_domain *domain = dev->domain;
409 /* The coherent mappings are handled in vduse_dev_free_coherent() */
410 if (domain->bounce_map)
411 vduse_domain_reset_bounce_map(domain);
414 dev->driver_features = 0;
416 spin_lock(&dev->irq_lock);
417 dev->config_cb.callback = NULL;
418 dev->config_cb.private = NULL;
419 spin_unlock(&dev->irq_lock);
/* Wait for an already running injection before continuing. */
420 flush_work(&dev->inject);
422 for (i = 0; i < dev->vq_num; i++) {
423 struct vduse_virtqueue *vq = &dev->vqs[i];
430 memset(&vq->state, 0, sizeof(vq->state));
432 spin_lock(&vq->kick_lock);
435 eventfd_ctx_put(vq->kickfd);
437 spin_unlock(&vq->kick_lock);
439 spin_lock(&vq->irq_lock);
440 vq->cb.callback = NULL;
441 vq->cb.private = NULL;
442 spin_unlock(&vq->irq_lock);
443 flush_work(&vq->inject);
444 flush_work(&vq->kick);
/*
 * vdpa_config_ops.set_vq_address: record the descriptor, driver and device
 * area addresses for virtqueue @idx. The values are later reported to
 * userspace by VDUSE_VQ_GET_INFO. One parameter line and the return statement
 * are not visible in this listing.
 */
448 static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx,
449 u64 desc_area, u64 driver_area,
452 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
453 struct vduse_virtqueue *vq = &dev->vqs[idx];
455 vq->desc_addr = desc_area;
456 vq->driver_addr = driver_area;
457 vq->device_addr = device_area;
/*
 * Kick @vq: under kick_lock, signal the queue's kick eventfd. The conditions
 * guarding the signal are not visible in this listing.
 */
462 static void vduse_vq_kick(struct vduse_virtqueue *vq)
464 spin_lock(&vq->kick_lock);
469 eventfd_signal(vq->kickfd, 1);
473 spin_unlock(&vq->kick_lock);
/*
 * Work handler for vq->kick: recovers the virtqueue from the work item. The
 * rest of the body is not visible in this listing (presumably it calls
 * vduse_vq_kick()).
 */
476 static void vduse_vq_kick_work(struct work_struct *work)
478 struct vduse_virtqueue *vq = container_of(work,
479 struct vduse_virtqueue, kick);
/*
 * vdpa_config_ops.kick_vq: notify userspace about virtqueue @idx. When
 * eventfd_signal_allowed() reports false, the kick is deferred to the vq->kick
 * work item. The other path is not visible in this listing (presumably a
 * direct vduse_vq_kick()).
 */
484 static void vduse_vdpa_kick_vq(struct vdpa_device *vdpa, u16 idx)
486 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
487 struct vduse_virtqueue *vq = &dev->vqs[idx];
489 if (!eventfd_signal_allowed()) {
490 schedule_work(&vq->kick);
/*
 * vdpa_config_ops.set_vq_cb: install the driver's interrupt callback for
 * virtqueue @idx. Both fields are updated under irq_lock, the same lock
 * vduse_vq_irq_inject() holds while invoking the callback.
 */
496 static void vduse_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
497 struct vdpa_callback *cb)
499 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
500 struct vduse_virtqueue *vq = &dev->vqs[idx];
502 spin_lock(&vq->irq_lock);
503 vq->cb.callback = cb->callback;
504 vq->cb.private = cb->private;
505 spin_unlock(&vq->irq_lock);
/*
 * vdpa_config_ops.set_vq_num: looks up virtqueue @idx. The statement that
 * uses @num is not visible in this listing (presumably it stores it in
 * vq->num).
 */
508 static void vduse_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num)
510 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
511 struct vduse_virtqueue *vq = &dev->vqs[idx];
/*
 * vdpa_config_ops.set_vq_ready: looks up virtqueue @idx. The remaining
 * parameter line and the statement updating the queue are not visible in this
 * listing.
 */
516 static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa,
519 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
520 struct vduse_virtqueue *vq = &dev->vqs[idx];
/*
 * vdpa_config_ops.get_vq_ready: looks up virtqueue @idx. The return statement
 * is not visible in this listing (presumably it yields vq->ready).
 */
525 static bool vduse_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 idx)
527 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
528 struct vduse_virtqueue *vq = &dev->vqs[idx];
/*
 * vdpa_config_ops.set_vq_state: cache the ring state for virtqueue @idx.
 * With VIRTIO_F_RING_PACKED negotiated, the four packed-ring fields are
 * copied; the split-ring avail_index is copied otherwise. The cached state is
 * what VDUSE_VQ_GET_INFO reports to userspace. The return statement is not
 * visible in this listing.
 */
533 static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx,
534 const struct vdpa_vq_state *state)
536 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
537 struct vduse_virtqueue *vq = &dev->vqs[idx];
539 if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
540 vq->state.packed.last_avail_counter =
541 state->packed.last_avail_counter;
542 vq->state.packed.last_avail_idx = state->packed.last_avail_idx;
543 vq->state.packed.last_used_counter =
544 state->packed.last_used_counter;
545 vq->state.packed.last_used_idx = state->packed.last_used_idx;
547 vq->state.split.avail_index = state->split.avail_index;
/*
 * vdpa_config_ops.get_vq_state: query userspace for the current ring state of
 * virtqueue @idx, using the packed or split helper depending on whether
 * VIRTIO_F_RING_PACKED was negotiated. Returns the helper's result.
 */
552 static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
553 struct vdpa_vq_state *state)
555 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
556 struct vduse_virtqueue *vq = &dev->vqs[idx];
558 if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED))
559 return vduse_dev_get_vq_state_packed(dev, vq, &state->packed);
561 return vduse_dev_get_vq_state_split(dev, vq, &state->split);
/* vdpa_config_ops.get_vq_align: return the vq_align value set in vduse_create_dev(). */
564 static u32 vduse_vdpa_get_vq_align(struct vdpa_device *vdpa)
566 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
568 return dev->vq_align;
/* vdpa_config_ops.get_features: return the feature bits set in vduse_create_dev(). */
571 static u64 vduse_vdpa_get_features(struct vdpa_device *vdpa)
573 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
575 return dev->device_features;
/*
 * vdpa_config_ops.set_features: record the features chosen by the driver.
 * The value is stored as-is; userspace reads it back via
 * VDUSE_DEV_GET_FEATURES. The return statement is not visible in this listing.
 */
578 static int vduse_vdpa_set_features(struct vdpa_device *vdpa, u64 features)
580 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
582 dev->driver_features = features;
/*
 * vdpa_config_ops.set_config_cb: install the config-change callback. Both
 * fields are updated under dev->irq_lock, the lock vduse_dev_irq_inject()
 * holds while invoking the callback.
 */
586 static void vduse_vdpa_set_config_cb(struct vdpa_device *vdpa,
587 struct vdpa_callback *cb)
589 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
591 spin_lock(&dev->irq_lock);
592 dev->config_cb.callback = cb->callback;
593 dev->config_cb.private = cb->private;
594 spin_unlock(&dev->irq_lock);
/*
 * vdpa_config_ops.get_vq_num_max: compute the largest num_max over all
 * virtqueues of the device. The declaration of num_max and the return
 * statement are not visible in this listing.
 */
597 static u16 vduse_vdpa_get_vq_num_max(struct vdpa_device *vdpa)
599 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
603 for (i = 0; i < dev->vq_num; i++)
604 if (num_max < dev->vqs[i].num_max)
605 num_max = dev->vqs[i].num_max;
/* vdpa_config_ops.get_device_id: return the device ID set in vduse_create_dev(). */
610 static u32 vduse_vdpa_get_device_id(struct vdpa_device *vdpa)
612 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
614 return dev->device_id;
/* vdpa_config_ops.get_vendor_id: return the vendor ID set in vduse_create_dev(). */
617 static u32 vduse_vdpa_get_vendor_id(struct vdpa_device *vdpa)
619 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
621 return dev->vendor_id;
/*
 * vdpa_config_ops.get_status: the return statement is not visible in this
 * listing (presumably it yields dev->status, which vduse_vdpa_set_status()
 * caches).
 */
624 static u8 vduse_vdpa_get_status(struct vdpa_device *vdpa)
626 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
/*
 * vdpa_config_ops.set_status: forward @status to userspace, then cache it in
 * dev->status. The branch taken when vduse_dev_set_status() fails is not
 * visible in this listing (presumably it returns without updating the cache).
 */
631 static void vduse_vdpa_set_status(struct vdpa_device *vdpa, u8 status)
633 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
635 if (vduse_dev_set_status(dev, status))
638 dev->status = status;
/* vdpa_config_ops.get_config_size: return the config space size set in vduse_create_dev(). */
641 static size_t vduse_vdpa_get_config_size(struct vdpa_device *vdpa)
643 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
645 return dev->config_size;
648 static void vduse_vdpa_get_config(struct vdpa_device *vdpa, unsigned int offset,
649 void *buf, unsigned int len)
651 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
653 if (len > dev->config_size - offset)
656 memcpy(buf, dev->config + offset, len);
/*
 * vdpa_config_ops.set_config: no write is performed in the visible code; the
 * in-body comment states that the configuration space is read-only for now.
 */
659 static void vduse_vdpa_set_config(struct vdpa_device *vdpa, unsigned int offset,
660 const void *buf, unsigned int len)
662 /* Now we only support read-only configuration space */
/*
 * vdpa_config_ops.reset: tell userspace the status is now 0, then reset the
 * kernel-side state with vduse_dev_reset(). Whether ret is checked before the
 * reset and what is returned are not visible in this listing.
 */
665 static int vduse_vdpa_reset(struct vdpa_device *vdpa)
667 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
668 int ret = vduse_dev_set_status(dev, 0);
670 vduse_dev_reset(dev);
/* vdpa_config_ops.get_generation: return the device's stored generation value. */
675 static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
677 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
679 return dev->generation;
/*
 * vdpa_config_ops.set_map: install @iotlb into the device's IOVA domain, then
 * notify userspace with an IOTLB update covering the whole range
 * [0, ULLONG_MAX]. The mapping is cleared again by vduse_domain_clear_map();
 * the condition guarding that call is not visible in this listing (presumably
 * failure of the notification).
 */
682 static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
683 struct vhost_iotlb *iotlb)
685 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
688 ret = vduse_domain_set_map(dev->domain, iotlb);
692 ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
694 vduse_domain_clear_map(dev->domain, iotlb);
/*
 * vdpa_config_ops.free: resolves the vduse_dev for @vdpa. The statement acting
 * on it is not visible in this listing.
 */
701 static void vduse_vdpa_free(struct vdpa_device *vdpa)
703 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
/*
 * vDPA configuration operations implemented by this driver; the table is
 * handed to vdpa_alloc_device() in vduse_dev_init_vdpa().
 */
708 static const struct vdpa_config_ops vduse_vdpa_config_ops = {
709 .set_vq_address = vduse_vdpa_set_vq_address,
710 .kick_vq = vduse_vdpa_kick_vq,
711 .set_vq_cb = vduse_vdpa_set_vq_cb,
712 .set_vq_num = vduse_vdpa_set_vq_num,
713 .set_vq_ready = vduse_vdpa_set_vq_ready,
714 .get_vq_ready = vduse_vdpa_get_vq_ready,
715 .set_vq_state = vduse_vdpa_set_vq_state,
716 .get_vq_state = vduse_vdpa_get_vq_state,
717 .get_vq_align = vduse_vdpa_get_vq_align,
718 .get_features = vduse_vdpa_get_features,
719 .set_features = vduse_vdpa_set_features,
720 .set_config_cb = vduse_vdpa_set_config_cb,
721 .get_vq_num_max = vduse_vdpa_get_vq_num_max,
722 .get_device_id = vduse_vdpa_get_device_id,
723 .get_vendor_id = vduse_vdpa_get_vendor_id,
724 .get_status = vduse_vdpa_get_status,
725 .set_status = vduse_vdpa_set_status,
726 .get_config_size = vduse_vdpa_get_config_size,
727 .get_config = vduse_vdpa_get_config,
728 .set_config = vduse_vdpa_set_config,
729 .get_generation = vduse_vdpa_get_generation,
730 .reset = vduse_vdpa_reset,
731 .set_map = vduse_vdpa_set_map,
732 .free = vduse_vdpa_free,
/*
 * dma_map_ops.map_page: delegate the mapping to the device's IOVA domain via
 * vduse_domain_map_page() and return its result. The line declaring the attrs
 * parameter is not visible in this listing.
 */
735 static dma_addr_t vduse_dev_map_page(struct device *dev, struct page *page,
736 unsigned long offset, size_t size,
737 enum dma_data_direction dir,
740 struct vduse_dev *vdev = dev_to_vduse(dev);
741 struct vduse_iova_domain *domain = vdev->domain;
743 return vduse_domain_map_page(domain, page, offset, size, dir, attrs);
/*
 * dma_map_ops.unmap_page: delegate the unmapping to the device's IOVA domain
 * via vduse_domain_unmap_page(). The line declaring the attrs parameter is not
 * visible in this listing.
 */
746 static void vduse_dev_unmap_page(struct device *dev, dma_addr_t dma_addr,
747 size_t size, enum dma_data_direction dir,
750 struct vduse_dev *vdev = dev_to_vduse(dev);
751 struct vduse_iova_domain *domain = vdev->domain;
753 return vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs);
/*
 * dma_map_ops.alloc: allocate coherent memory from the device's IOVA domain.
 * *dma_addr is preset to DMA_MAPPING_ERROR and overwritten with the IOVA
 * obtained from vduse_domain_alloc_coherent(). The failure check between
 * those steps and the return statement are not visible in this listing.
 */
756 static void *vduse_dev_alloc_coherent(struct device *dev, size_t size,
757 dma_addr_t *dma_addr, gfp_t flag,
760 struct vduse_dev *vdev = dev_to_vduse(dev);
761 struct vduse_iova_domain *domain = vdev->domain;
765 *dma_addr = DMA_MAPPING_ERROR;
766 addr = vduse_domain_alloc_coherent(domain, size,
767 (dma_addr_t *)&iova, flag, attrs);
771 *dma_addr = (dma_addr_t)iova;
/*
 * dma_map_ops.free: release coherent memory through the device's IOVA domain
 * via vduse_domain_free_coherent(). The line declaring the attrs parameter is
 * not visible in this listing.
 */
776 static void vduse_dev_free_coherent(struct device *dev, size_t size,
777 void *vaddr, dma_addr_t dma_addr,
780 struct vduse_dev *vdev = dev_to_vduse(dev);
781 struct vduse_iova_domain *domain = vdev->domain;
783 vduse_domain_free_coherent(domain, size, vaddr, dma_addr, attrs);
/* dma_map_ops.max_mapping_size: report the IOVA domain's bounce_size as the limit. */
786 static size_t vduse_dev_max_mapping_size(struct device *dev)
788 struct vduse_dev *vdev = dev_to_vduse(dev);
789 struct vduse_iova_domain *domain = vdev->domain;
791 return domain->bounce_size;
/* DMA operations installed on the vDPA device by set_dma_ops() in vduse_dev_init_vdpa(). */
794 static const struct dma_map_ops vduse_dev_dma_ops = {
795 .map_page = vduse_dev_map_page,
796 .unmap_page = vduse_dev_unmap_page,
797 .alloc = vduse_dev_alloc_coherent,
798 .free = vduse_dev_free_coherent,
799 .max_mapping_size = vduse_dev_max_mapping_size,
/*
 * Translate a VDUSE_ACCESS_* permission into file flags; the result is passed
 * to receive_fd() by VDUSE_IOTLB_GET_FD. The flag assigned in each case and
 * the return statement are not visible in this listing. An unrecognised
 * permission triggers a WARN.
 */
802 static unsigned int perm_to_file_flags(u8 perm)
804 unsigned int flags = 0;
807 case VDUSE_ACCESS_WO:
810 case VDUSE_ACCESS_RO:
813 case VDUSE_ACCESS_RW:
817 WARN(1, "invalidate vhost IOTLB permission\n");
/*
 * Assign or de-assign the kick eventfd of one virtqueue (VDUSE_VQ_SETUP_KICKFD).
 *
 * The queue index is range-checked and clamped with array_index_nospec()
 * against speculative out-of-bounds access. A non-negative fd is resolved
 * with eventfd_ctx_fdget(); the only accepted negative value is
 * VDUSE_EVENTFD_DEASSIGN. Under kick_lock the previous context is released
 * and, if the queue is ready and was kicked while a kickfd is set, the eventfd
 * is signalled. The assignment of the new context and the return values are
 * not visible in this listing.
 */
824 static int vduse_kickfd_setup(struct vduse_dev *dev,
825 struct vduse_vq_eventfd *eventfd)
827 struct eventfd_ctx *ctx = NULL;
828 struct vduse_virtqueue *vq;
831 if (eventfd->index >= dev->vq_num)
834 index = array_index_nospec(eventfd->index, dev->vq_num);
835 vq = &dev->vqs[index];
836 if (eventfd->fd >= 0) {
837 ctx = eventfd_ctx_fdget(eventfd->fd);
840 } else if (eventfd->fd != VDUSE_EVENTFD_DEASSIGN)
843 spin_lock(&vq->kick_lock);
845 eventfd_ctx_put(vq->kickfd);
847 if (vq->ready && vq->kicked && vq->kickfd) {
848 eventfd_signal(vq->kickfd, 1);
851 spin_unlock(&vq->kick_lock);
/*
 * Check every virtqueue for a zero num_max (the value VDUSE_VQ_SETUP stores).
 * Return statements are not visible in this listing; presumably the device is
 * ready only when no queue has num_max == 0. Used by vdpa_dev_add().
 */
856 static bool vduse_dev_is_ready(struct vduse_dev *dev)
860 for (i = 0; i < dev->vq_num; i++)
861 if (!dev->vqs[i].num_max)
/*
 * Work handler for dev->inject: invoke the config-change callback, if one is
 * installed, with interrupts disabled and dev->irq_lock held.
 */
867 static void vduse_dev_irq_inject(struct work_struct *work)
869 struct vduse_dev *dev = container_of(work, struct vduse_dev, inject);
871 spin_lock_irq(&dev->irq_lock);
872 if (dev->config_cb.callback)
873 dev->config_cb.callback(dev->config_cb.private);
874 spin_unlock_irq(&dev->irq_lock);
/*
 * Work handler for vq->inject: invoke the virtqueue interrupt callback with
 * interrupts disabled and vq->irq_lock held, but only when the queue is ready
 * and a callback is installed.
 */
877 static void vduse_vq_irq_inject(struct work_struct *work)
879 struct vduse_virtqueue *vq = container_of(work,
880 struct vduse_virtqueue, inject);
882 spin_lock_irq(&vq->irq_lock);
883 if (vq->ready && vq->cb.callback)
884 vq->cb.callback(vq->cb.private);
885 spin_unlock_irq(&vq->irq_lock);
888 static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
891 struct vduse_dev *dev = file->private_data;
892 void __user *argp = (void __user *)arg;
895 if (unlikely(dev->broken))
899 case VDUSE_IOTLB_GET_FD: {
900 struct vduse_iotlb_entry entry;
901 struct vhost_iotlb_map *map;
902 struct vdpa_map_file *map_file;
903 struct vduse_iova_domain *domain = dev->domain;
904 struct file *f = NULL;
907 if (copy_from_user(&entry, argp, sizeof(entry)))
911 if (entry.start > entry.last)
914 spin_lock(&domain->iotlb_lock);
915 map = vhost_iotlb_itree_first(domain->iotlb,
916 entry.start, entry.last);
918 map_file = (struct vdpa_map_file *)map->opaque;
919 f = get_file(map_file->file);
920 entry.offset = map_file->offset;
921 entry.start = map->start;
922 entry.last = map->last;
923 entry.perm = map->perm;
925 spin_unlock(&domain->iotlb_lock);
931 if (copy_to_user(argp, &entry, sizeof(entry))) {
935 ret = receive_fd(f, perm_to_file_flags(entry.perm));
939 case VDUSE_DEV_GET_FEATURES:
941 * Just mirror what driver wrote here.
942 * The driver is expected to check FEATURE_OK later.
944 ret = put_user(dev->driver_features, (u64 __user *)argp);
946 case VDUSE_DEV_SET_CONFIG: {
947 struct vduse_config_data config;
948 unsigned long size = offsetof(struct vduse_config_data,
952 if (copy_from_user(&config, argp, size))
956 if (config.length == 0 ||
957 config.length > dev->config_size - config.offset)
961 if (copy_from_user(dev->config + config.offset, argp + size,
968 case VDUSE_DEV_INJECT_CONFIG_IRQ:
970 queue_work(vduse_irq_wq, &dev->inject);
972 case VDUSE_VQ_SETUP: {
973 struct vduse_vq_config config;
977 if (copy_from_user(&config, argp, sizeof(config)))
981 if (config.index >= dev->vq_num)
984 if (!is_mem_zero((const char *)config.reserved,
985 sizeof(config.reserved)))
988 index = array_index_nospec(config.index, dev->vq_num);
989 dev->vqs[index].num_max = config.max_size;
993 case VDUSE_VQ_GET_INFO: {
994 struct vduse_vq_info vq_info;
995 struct vduse_virtqueue *vq;
999 if (copy_from_user(&vq_info, argp, sizeof(vq_info)))
1003 if (vq_info.index >= dev->vq_num)
1006 index = array_index_nospec(vq_info.index, dev->vq_num);
1007 vq = &dev->vqs[index];
1008 vq_info.desc_addr = vq->desc_addr;
1009 vq_info.driver_addr = vq->driver_addr;
1010 vq_info.device_addr = vq->device_addr;
1011 vq_info.num = vq->num;
1013 if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
1014 vq_info.packed.last_avail_counter =
1015 vq->state.packed.last_avail_counter;
1016 vq_info.packed.last_avail_idx =
1017 vq->state.packed.last_avail_idx;
1018 vq_info.packed.last_used_counter =
1019 vq->state.packed.last_used_counter;
1020 vq_info.packed.last_used_idx =
1021 vq->state.packed.last_used_idx;
1023 vq_info.split.avail_index =
1024 vq->state.split.avail_index;
1026 vq_info.ready = vq->ready;
1029 if (copy_to_user(argp, &vq_info, sizeof(vq_info)))
1035 case VDUSE_VQ_SETUP_KICKFD: {
1036 struct vduse_vq_eventfd eventfd;
1039 if (copy_from_user(&eventfd, argp, sizeof(eventfd)))
1042 ret = vduse_kickfd_setup(dev, &eventfd);
1045 case VDUSE_VQ_INJECT_IRQ: {
1049 if (get_user(index, (u32 __user *)argp))
1053 if (index >= dev->vq_num)
1057 index = array_index_nospec(index, dev->vq_num);
1058 queue_work(vduse_irq_wq, &dev->vqs[index].inject);
/*
 * release() handler for a per-device node. Requests that userspace had read
 * but not yet answered are moved back to send_list, and the device is marked
 * as no longer connected. The return statement is not visible in this listing.
 */
1069 static int vduse_dev_release(struct inode *inode, struct file *file)
1071 struct vduse_dev *dev = file->private_data;
1073 spin_lock(&dev->msg_lock);
1074 /* Make sure the inflight messages can be processed after reconnection */
1075 list_splice_init(&dev->recv_list, &dev->send_list);
1076 spin_unlock(&dev->msg_lock);
1077 dev->connected = false;
/*
 * Look up the vduse_dev registered under @minor in vduse_idr, holding
 * vduse_lock for the duration of the lookup. The return statement is not
 * visible in this listing (presumably it yields dev).
 */
1082 static struct vduse_dev *vduse_dev_get_from_minor(int minor)
1084 struct vduse_dev *dev;
1086 mutex_lock(&vduse_lock);
1087 dev = idr_find(&vduse_idr, minor);
1088 mutex_unlock(&vduse_lock);
/*
 * open() handler for a per-device node. Resolves the device from the inode's
 * minor number and, under dev->lock, marks it connected and stores it in
 * file->private_data. The checks for a missing or already connected device
 * and the return values are not visible in this listing.
 */
1093 static int vduse_dev_open(struct inode *inode, struct file *file)
1096 struct vduse_dev *dev = vduse_dev_get_from_minor(iminor(inode));
1102 mutex_lock(&dev->lock);
1107 dev->connected = true;
1108 file->private_data = dev;
1110 mutex_unlock(&dev->lock);
/* File operations of the per-device nodes; bound to vduse_cdev in vduse_init(). */
1115 static const struct file_operations vduse_dev_fops = {
1116 .owner = THIS_MODULE,
1117 .open = vduse_dev_open,
1118 .release = vduse_dev_release,
1119 .read_iter = vduse_dev_read_iter,
1120 .write_iter = vduse_dev_write_iter,
1121 .poll = vduse_dev_poll,
1122 .unlocked_ioctl = vduse_dev_ioctl,
1123 .compat_ioctl = compat_ptr_ioctl,
1124 .llseek = noop_llseek,
/*
 * Allocate a zero-initialised vduse_dev and set up its locks, message lists,
 * config-interrupt work item and wait queue. The allocation-failure check and
 * the return statement are not visible in this listing.
 */
1127 static struct vduse_dev *vduse_dev_create(void)
1129 struct vduse_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);
1134 mutex_init(&dev->lock);
1135 spin_lock_init(&dev->msg_lock);
1136 INIT_LIST_HEAD(&dev->send_list);
1137 INIT_LIST_HEAD(&dev->recv_list);
1138 spin_lock_init(&dev->irq_lock);
1140 INIT_WORK(&dev->inject, vduse_dev_irq_inject);
1141 init_waitqueue_head(&dev->waitq);
/*
 * Counterpart of vduse_dev_create(). The body is not visible in this listing;
 * it is called as the last teardown step in vduse_destroy_dev() and in
 * vduse_create_dev().
 */
1146 static void vduse_dev_destroy(struct vduse_dev *dev)
/*
 * Walk vduse_idr looking for a device whose name equals @name. Return
 * statements are not visible in this listing. Visible callers hold vduse_lock
 * (vdpa_dev_add() directly, the others via vduse_ioctl()).
 */
1151 static struct vduse_dev *vduse_find_dev(const char *name)
1153 struct vduse_dev *dev;
1156 idr_for_each_entry(&vduse_idr, dev, id)
1157 if (!strcmp(dev->name, name))
/*
 * Destroy the device called @name (VDUSE_DESTROY_DEV).
 *
 * Under dev->lock, a device that still has a vDPA device attached or is
 * opened by userspace takes the early-unlock branch. Otherwise the state is
 * reset, the device node and IDR entry are removed, the config buffer and
 * IOVA domain are freed, the structure is destroyed and the module reference
 * taken in vduse_create_dev() is dropped. Return values are not visible in
 * this listing.
 */
1163 static int vduse_destroy_dev(char *name)
1165 struct vduse_dev *dev = vduse_find_dev(name);
1170 mutex_lock(&dev->lock);
1171 if (dev->vdev || dev->connected) {
1172 mutex_unlock(&dev->lock);
/* NOTE(review): presumably set so that a concurrent open is refused during teardown -- confirm against vduse_dev_open(). */
1175 dev->connected = true;
1176 mutex_unlock(&dev->lock);
1178 vduse_dev_reset(dev);
1179 device_destroy(vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
1180 idr_remove(&vduse_idr, dev->minor);
1181 kvfree(dev->config);
1183 vduse_domain_destroy(dev->domain);
1185 vduse_dev_destroy(dev);
1186 module_put(THIS_MODULE);
/*
 * Search allowed_device_id[] for @device_id. Return statements are not
 * visible in this listing (presumably true on a match, false otherwise).
 */
1191 static bool device_is_allowed(u32 device_id)
1195 for (i = 0; i < ARRAY_SIZE(allowed_device_id); i++)
1196 if (allowed_device_id[i] == device_id)
/*
 * Validate the feature bits offered by a new device. Two conditions are
 * tested: VIRTIO_F_ACCESS_PLATFORM being absent, and VIRTIO_BLK_F_CONFIG_WCE
 * being present. Return statements are not visible in this listing
 * (presumably both conditions reject the feature set).
 */
1202 static bool features_is_valid(u64 features)
1204 if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM)))
1207 /* Now we only support read-only configuration space */
1208 if (features & (1ULL << VIRTIO_BLK_F_CONFIG_WCE))
/*
 * Sanity-check a userspace-supplied vduse_dev_config. The checks cover: a
 * non-zero reserved area, vq_align or config_size above PAGE_SIZE, a device
 * ID not on the allow list, and invalid feature bits. Return statements are
 * not visible in this listing (presumably each check rejects the config).
 */
1214 static bool vduse_validate_config(struct vduse_dev_config *config)
1216 if (!is_mem_zero((const char *)config->reserved,
1217 sizeof(config->reserved)))
1220 if (config->vq_align > PAGE_SIZE)
1223 if (config->config_size > PAGE_SIZE)
1226 if (!device_is_allowed(config->device_id))
1229 if (!features_is_valid(config->features))
/* sysfs show handler for "msg_timeout": print the current timeout as an unsigned decimal. */
1235 static ssize_t msg_timeout_show(struct device *device,
1236 struct device_attribute *attr, char *buf)
1238 struct vduse_dev *dev = dev_get_drvdata(device);
1240 return sysfs_emit(buf, "%u\n", dev->msg_timeout);
/*
 * sysfs store handler for "msg_timeout": parse @buf as a base-10 unsigned
 * integer directly into dev->msg_timeout. Error handling and the return value
 * are not visible in this listing.
 */
1243 static ssize_t msg_timeout_store(struct device *device,
1244 struct device_attribute *attr,
1245 const char *buf, size_t count)
1247 struct vduse_dev *dev = dev_get_drvdata(device);
1250 ret = kstrtouint(buf, 10, &dev->msg_timeout);
/* Read-write "msg_timeout" attribute backed by msg_timeout_show()/msg_timeout_store(). */
1257 static DEVICE_ATTR_RW(msg_timeout);
/* Attribute list behind vduse_dev_groups, which vduse_init() installs on vduse_class. */
1259 static struct attribute *vduse_dev_attrs[] = {
1260 &dev_attr_msg_timeout.attr,
1264 ATTRIBUTE_GROUPS(vduse_dev);
/*
 * Create a new VDUSE device from a validated @config (VDUSE_CREATE_DEV).
 *
 * Checks for an existing device with the same name, allocates the device,
 * copies the identity and feature fields, duplicates the name, creates the
 * IOVA domain, adopts @config_buf as the config space, allocates and
 * initialises the virtqueue array, allocates an ID starting at 1, creates the
 * device node and takes a module reference. The trailing calls (idr_remove,
 * vduse_domain_destroy, vduse_dev_destroy) form the error unwinding; labels,
 * failure checks and return values are not visible in this listing.
 */
1266 static int vduse_create_dev(struct vduse_dev_config *config,
1267 void *config_buf, u64 api_version)
1270 struct vduse_dev *dev;
1273 if (vduse_find_dev(config->name))
1277 dev = vduse_dev_create();
1281 dev->api_version = api_version;
1282 dev->device_features = config->features;
1283 dev->device_id = config->device_id;
1284 dev->vendor_id = config->vendor_id;
1285 dev->name = kstrdup(config->name, GFP_KERNEL);
1289 dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1,
1294 dev->config = config_buf;
1295 dev->config_size = config->config_size;
1296 dev->vq_align = config->vq_align;
1297 dev->vq_num = config->vq_num;
1298 dev->vqs = kcalloc(dev->vq_num, sizeof(*dev->vqs), GFP_KERNEL);
1302 for (i = 0; i < dev->vq_num; i++) {
1303 dev->vqs[i].index = i;
1304 INIT_WORK(&dev->vqs[i].inject, vduse_vq_irq_inject);
1305 INIT_WORK(&dev->vqs[i].kick, vduse_vq_kick_work);
1306 spin_lock_init(&dev->vqs[i].kick_lock);
1307 spin_lock_init(&dev->vqs[i].irq_lock);
/* IDs start at 1: minor 0 of the region belongs to the control node (see vduse_init()). */
1310 ret = idr_alloc(&vduse_idr, dev, 1, VDUSE_DEV_MAX, GFP_KERNEL);
1315 dev->msg_timeout = VDUSE_MSG_DEFAULT_TIMEOUT;
1316 dev->dev = device_create(vduse_class, NULL,
1317 MKDEV(MAJOR(vduse_major), dev->minor),
1318 dev, "%s", config->name);
1319 if (IS_ERR(dev->dev)) {
1320 ret = PTR_ERR(dev->dev);
/* Dropped again by module_put() in vduse_destroy_dev(). */
1323 __module_get(THIS_MODULE);
1327 idr_remove(&vduse_idr, dev->minor);
1331 vduse_domain_destroy(dev->domain);
1335 vduse_dev_destroy(dev);
/*
 * ioctl handler for the control node. The whole dispatch runs under
 * vduse_lock.
 *
 *   VDUSE_GET_API_VERSION - return the API version stored for this open file.
 *   VDUSE_SET_API_VERSION - store a version, testing it against
 *                           VDUSE_API_VERSION.
 *   VDUSE_CREATE_DEV      - copy and validate the fixed part of the config,
 *                           duplicate the variable-size config space from
 *                           userspace, force NUL termination of the name and
 *                           create the device.
 *   VDUSE_DESTROY_DEV     - copy the name, force NUL termination and destroy
 *                           the device.
 *
 * Error codes and break/return statements are not visible in this listing.
 */
1341 static long vduse_ioctl(struct file *file, unsigned int cmd,
1345 void __user *argp = (void __user *)arg;
1346 struct vduse_control *control = file->private_data;
1348 mutex_lock(&vduse_lock);
1350 case VDUSE_GET_API_VERSION:
1351 ret = put_user(control->api_version, (u64 __user *)argp);
1353 case VDUSE_SET_API_VERSION: {
1357 if (get_user(api_version, (u64 __user *)argp))
1361 if (api_version > VDUSE_API_VERSION)
1365 control->api_version = api_version;
1368 case VDUSE_CREATE_DEV: {
1369 struct vduse_dev_config config;
1370 unsigned long size = offsetof(struct vduse_dev_config, config);
1374 if (copy_from_user(&config, argp, size))
1378 if (vduse_validate_config(&config) == false)
/* config_size was capped at PAGE_SIZE by vduse_validate_config(). */
1381 buf = vmemdup_user(argp + size, config.config_size);
/* The name comes from userspace and may lack a terminator. */
1386 config.name[VDUSE_NAME_MAX - 1] = '\0';
1387 ret = vduse_create_dev(&config, buf, control->api_version);
1390 case VDUSE_DESTROY_DEV: {
1391 char name[VDUSE_NAME_MAX];
1394 if (copy_from_user(name, argp, VDUSE_NAME_MAX))
1397 name[VDUSE_NAME_MAX - 1] = '\0';
1398 ret = vduse_destroy_dev(name);
1405 mutex_unlock(&vduse_lock);
/*
 * release() handler for the control node. Fetches the per-open vduse_control;
 * the rest of the body is not visible in this listing (presumably it frees
 * the structure allocated in vduse_open()).
 */
1410 static int vduse_release(struct inode *inode, struct file *file)
1412 struct vduse_control *control = file->private_data;
/*
 * open() handler for the control node. Allocates the per-open vduse_control,
 * initialises its api_version to VDUSE_API_VERSION and attaches it to the
 * file. The allocation-failure check and return values are not visible in
 * this listing.
 */
1418 static int vduse_open(struct inode *inode, struct file *file)
1420 struct vduse_control *control;
1422 control = kmalloc(sizeof(struct vduse_control), GFP_KERNEL);
1426 control->api_version = VDUSE_API_VERSION;
1427 file->private_data = control;
/*
 * File operations of the control node; bound to vduse_ctrl_cdev in
 * vduse_init(). The .open entry is not visible in this listing.
 */
1432 static const struct file_operations vduse_ctrl_fops = {
1433 .owner = THIS_MODULE,
1435 .release = vduse_release,
1436 .unlocked_ioctl = vduse_ioctl,
1437 .compat_ioctl = compat_ptr_ioctl,
1438 .llseek = noop_llseek,
/*
 * class->devnode callback: build the node name "vduse/<device name>". The
 * string is allocated with kasprintf(); @mode is left untouched.
 */
1441 static char *vduse_devnode(struct device *dev, umode_t *mode)
1443 return kasprintf(GFP_KERNEL, "vduse/%s", dev_name(dev));
/*
 * Release callback of vduse_mgmtdev. The body is not visible in this listing;
 * vduse_mgmtdev is a static object.
 */
1446 static void vduse_mgmtdev_release(struct device *dev)
/* Parent device of the vDPA management device; registered in vduse_mgmtdev_init(). */
1450 static struct device vduse_mgmtdev = {
1451 .init_name = "vduse",
1452 .release = vduse_mgmtdev_release,
/* Forward declaration: vduse_dev_init_vdpa() references mgmt_dev before its definition. */
1455 static struct vdpa_mgmt_dev mgmt_dev;
/*
 * Allocate and prepare the vDPA device for @dev.
 *
 * The vDPA device is allocated with the driver's config ops and made its own
 * DMA device: the DMA mask pointer is aimed at the coherent mask, a 64-bit
 * mask is set, and the driver's dma_map_ops are installed. It is also
 * attached to mgmt_dev. put_device() releases the device on the failure path.
 * The linking of dev and vdev and the return values are not visible in this
 * listing.
 */
1457 static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name)
1459 struct vduse_vdpa *vdev;
1465 vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev,
1466 &vduse_vdpa_config_ops, name, true);
1468 return PTR_ERR(vdev);
1472 vdev->vdpa.dev.dma_mask = &vdev->vdpa.dev.coherent_dma_mask;
1473 ret = dma_set_mask_and_coherent(&vdev->vdpa.dev, DMA_BIT_MASK(64));
1475 put_device(&vdev->vdpa.dev);
1478 set_dma_ops(&vdev->vdpa.dev, &vduse_dev_dma_ops);
1479 vdev->vdpa.dma_dev = &vdev->vdpa.dev;
1480 vdev->vdpa.mdev = &mgmt_dev;
/*
 * vdpa_mgmtdev_ops.dev_add: attach a vDPA device to the VDUSE device called
 * @name. Under vduse_lock the device must exist and pass vduse_dev_is_ready();
 * the vDPA device is then initialised and, after the lock is dropped,
 * registered with vq_num queues. put_device() releases it on the registration
 * failure path. Return values are not visible in this listing.
 */
1485 static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name)
1487 struct vduse_dev *dev;
1490 mutex_lock(&vduse_lock);
1491 dev = vduse_find_dev(name);
1492 if (!dev || !vduse_dev_is_ready(dev)) {
1493 mutex_unlock(&vduse_lock);
1496 ret = vduse_dev_init_vdpa(dev, name);
1497 mutex_unlock(&vduse_lock);
1501 ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num);
1503 put_device(&dev->vdev->vdpa.dev);
/* vdpa_mgmtdev_ops.dev_del: unregister the given vDPA device. */
1510 static void vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev)
1512 _vdpa_unregister_device(dev);
/* Management-device callbacks referenced by mgmt_dev.ops. */
1515 static const struct vdpa_mgmtdev_ops vdpa_dev_mgmtdev_ops = {
1516 .dev_add = vdpa_dev_add,
1517 .dev_del = vdpa_dev_del,
/*
 * Virtio device IDs advertised through mgmt_dev.id_table: block devices from
 * any vendor. Any further entries or terminator are not visible in this
 * listing.
 */
1520 static struct virtio_device_id id_table[] = {
1521 { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
/* vDPA management device registered by vduse_mgmtdev_init(). */
1525 static struct vdpa_mgmt_dev mgmt_dev = {
1526 .device = &vduse_mgmtdev,
1527 .id_table = id_table,
1528 .ops = &vdpa_dev_mgmtdev_ops,
/*
 * Register the parent device and then the vDPA management device. The
 * trailing device_unregister() undoes the first step on the failure path;
 * failure checks and return values are not visible in this listing.
 */
1531 static int vduse_mgmtdev_init(void)
1535 ret = device_register(&vduse_mgmtdev);
1539 ret = vdpa_mgmtdev_register(&mgmt_dev);
1545 device_unregister(&vduse_mgmtdev);
/* Undo vduse_mgmtdev_init() in reverse order of registration. */
1549 static void vduse_mgmtdev_exit(void)
1551 vdpa_mgmtdev_unregister(&mgmt_dev);
1552 device_unregister(&vduse_mgmtdev);
/*
 * Module initialisation.
 *
 * Creates the "vduse" class (with devnode and sysfs attribute groups),
 * reserves the char-device region, registers the control cdev and its device
 * node, registers the per-device cdev starting at minor 1, allocates the
 * interrupt workqueue, and initialises the IOVA domain code and the
 * management device. The calls after vduse_mgmtdev_init() are the error
 * unwinding, in reverse order of setup; labels, failure checks and return
 * values are not visible in this listing.
 */
1555 static int vduse_init(void)
1560 vduse_class = class_create(THIS_MODULE, "vduse");
1561 if (IS_ERR(vduse_class))
1562 return PTR_ERR(vduse_class);
1564 vduse_class->devnode = vduse_devnode;
1565 vduse_class->dev_groups = vduse_dev_groups;
1567 ret = alloc_chrdev_region(&vduse_major, 0, VDUSE_DEV_MAX, "vduse");
1569 goto err_chardev_region;
1571 /* /dev/vduse/control */
1572 cdev_init(&vduse_ctrl_cdev, &vduse_ctrl_fops);
1573 vduse_ctrl_cdev.owner = THIS_MODULE;
1574 ret = cdev_add(&vduse_ctrl_cdev, vduse_major, 1);
1578 dev = device_create(vduse_class, NULL, vduse_major, NULL, "control");
1584 /* /dev/vduse/$DEVICE */
1585 cdev_init(&vduse_cdev, &vduse_dev_fops);
1586 vduse_cdev.owner = THIS_MODULE;
1587 ret = cdev_add(&vduse_cdev, MKDEV(MAJOR(vduse_major), 1),
1592 vduse_irq_wq = alloc_workqueue("vduse-irq",
1593 WQ_HIGHPRI | WQ_SYSFS | WQ_UNBOUND, 0);
1594 if (!vduse_irq_wq) {
1599 ret = vduse_domain_init();
1603 ret = vduse_mgmtdev_init();
1609 vduse_domain_exit();
1611 destroy_workqueue(vduse_irq_wq);
1613 cdev_del(&vduse_cdev);
1615 device_destroy(vduse_class, vduse_major);
1617 cdev_del(&vduse_ctrl_cdev);
1619 unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
1621 class_destroy(vduse_class);
1624 module_init(vduse_init);
/* Module teardown: release everything vduse_init() set up, in reverse order. */
1626 static void vduse_exit(void)
1628 vduse_mgmtdev_exit();
1629 vduse_domain_exit();
1630 destroy_workqueue(vduse_irq_wq);
1631 cdev_del(&vduse_cdev);
1632 device_destroy(vduse_class, vduse_major);
1633 cdev_del(&vduse_ctrl_cdev);
1634 unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
1635 class_destroy(vduse_class);
1637 module_exit(vduse_exit);
/* Module metadata, taken from the DRV_* macros defined near the top of the file. */
1639 MODULE_LICENSE(DRV_LICENSE);
1640 MODULE_AUTHOR(DRV_AUTHOR);
1641 MODULE_DESCRIPTION(DRV_DESC);