1 // SPDX-License-Identifier: GPL-2.0-only
5 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
6 * Author: Alex Williamson <alex.williamson@redhat.com>
8 * Derived from original vfio:
9 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
10 * Author: Tom Lyon, pugs@cisco.com
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/file.h>
17 #include <linux/anon_inodes.h>
19 #include <linux/idr.h>
20 #include <linux/iommu.h>
21 #include <linux/list.h>
22 #include <linux/miscdevice.h>
23 #include <linux/module.h>
24 #include <linux/mutex.h>
25 #include <linux/pci.h>
26 #include <linux/rwsem.h>
27 #include <linux/sched.h>
28 #include <linux/slab.h>
29 #include <linux/stat.h>
30 #include <linux/string.h>
31 #include <linux/uaccess.h>
32 #include <linux/vfio.h>
33 #include <linux/wait.h>
34 #include <linux/sched/signal.h>
37 #define DRIVER_VERSION "0.3"
38 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
39 #define DRIVER_DESC "VFIO - User Level meta-driver"
43 struct list_head iommu_drivers_list;
44 struct mutex iommu_drivers_lock;
45 struct list_head group_list;
46 struct mutex group_lock; /* locks group_list */
51 struct vfio_iommu_driver {
52 const struct vfio_iommu_driver_ops *ops;
53 struct list_head vfio_next;
56 struct vfio_container {
58 struct list_head group_list;
59 struct rw_semaphore group_lock;
60 struct vfio_iommu_driver *iommu_driver;
65 struct vfio_unbound_dev {
67 struct list_head unbound_next;
74 atomic_t container_users;
75 struct iommu_group *iommu_group;
76 struct vfio_container *container;
77 struct list_head device_list;
78 struct mutex device_lock;
79 struct notifier_block nb;
80 struct list_head vfio_next;
81 struct list_head container_next;
82 struct list_head unbound_list;
83 struct mutex unbound_lock;
85 wait_queue_head_t container_q;
86 enum vfio_group_type type;
87 unsigned int dev_counter;
89 struct blocking_notifier_head notifier;
92 #ifdef CONFIG_VFIO_NOIOMMU
93 static bool noiommu __read_mostly;
94 module_param_named(enable_unsafe_noiommu_mode,
95 noiommu, bool, S_IRUGO | S_IWUSR);
96 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
99 static DEFINE_XARRAY(vfio_device_set_xa);
100 static const struct file_operations vfio_group_fops;
102 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
104 unsigned long idx = (unsigned long)set_id;
105 struct vfio_device_set *new_dev_set;
106 struct vfio_device_set *dev_set;
108 if (WARN_ON(!set_id))
112 * Atomically acquire a singleton object in the xarray for this set_id
114 xa_lock(&vfio_device_set_xa);
115 dev_set = xa_load(&vfio_device_set_xa, idx);
118 xa_unlock(&vfio_device_set_xa);
120 new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
123 mutex_init(&new_dev_set->lock);
124 INIT_LIST_HEAD(&new_dev_set->device_list);
125 new_dev_set->set_id = set_id;
127 xa_lock(&vfio_device_set_xa);
128 dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
131 dev_set = new_dev_set;
136 if (xa_is_err(dev_set)) {
137 xa_unlock(&vfio_device_set_xa);
138 return xa_err(dev_set);
142 dev_set->device_count++;
143 xa_unlock(&vfio_device_set_xa);
144 mutex_lock(&dev_set->lock);
145 device->dev_set = dev_set;
146 list_add_tail(&device->dev_set_list, &dev_set->device_list);
147 mutex_unlock(&dev_set->lock);
150 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
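/*
 * Example (illustrative sketch, not taken from an in-tree driver): a driver
 * whose devices share a reset domain can key the set on any kernel pointer
 * common to those devices; "mydev" and "shared_token" below are hypothetical
 * caller-owned objects.
 *
 *	ret = vfio_assign_device_set(&mydev->vdev, shared_token);
 *	if (ret)
 *		return ret;
 *
 * Devices that pass the same set_id land in one vfio_device_set and
 * serialize on dev_set->lock.
 */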
152 static void vfio_release_device_set(struct vfio_device *device)
154 struct vfio_device_set *dev_set = device->dev_set;
159 mutex_lock(&dev_set->lock);
160 list_del(&device->dev_set_list);
161 mutex_unlock(&dev_set->lock);
163 xa_lock(&vfio_device_set_xa);
164 if (!--dev_set->device_count) {
165 __xa_erase(&vfio_device_set_xa,
166 (unsigned long)dev_set->set_id);
167 mutex_destroy(&dev_set->lock);
170 xa_unlock(&vfio_device_set_xa);
173 #ifdef CONFIG_VFIO_NOIOMMU
174 static void *vfio_noiommu_open(unsigned long arg)
176 if (arg != VFIO_NOIOMMU_IOMMU)
177 return ERR_PTR(-EINVAL);
178 if (!capable(CAP_SYS_RAWIO))
179 return ERR_PTR(-EPERM);
184 static void vfio_noiommu_release(void *iommu_data)
188 static long vfio_noiommu_ioctl(void *iommu_data,
189 unsigned int cmd, unsigned long arg)
191 if (cmd == VFIO_CHECK_EXTENSION)
192 return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
197 static int vfio_noiommu_attach_group(void *iommu_data,
198 struct iommu_group *iommu_group, enum vfio_group_type type)
203 static void vfio_noiommu_detach_group(void *iommu_data,
204 struct iommu_group *iommu_group)
208 static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
209 .name = "vfio-noiommu",
210 .owner = THIS_MODULE,
211 .open = vfio_noiommu_open,
212 .release = vfio_noiommu_release,
213 .ioctl = vfio_noiommu_ioctl,
214 .attach_group = vfio_noiommu_attach_group,
215 .detach_group = vfio_noiommu_detach_group,
219 * Only noiommu containers can use vfio-noiommu and noiommu containers can only
220 * use vfio-noiommu.
222 static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
223 const struct vfio_iommu_driver *driver)
225 return container->noiommu == (driver->ops == &vfio_noiommu_ops);
228 static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
229 const struct vfio_iommu_driver *driver)
233 #endif /* CONFIG_VFIO_NOIOMMU */
236 * IOMMU driver registration
238 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
240 struct vfio_iommu_driver *driver, *tmp;
242 driver = kzalloc(sizeof(*driver), GFP_KERNEL);
248 mutex_lock(&vfio.iommu_drivers_lock);
250 /* Check for duplicates */
251 list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
252 if (tmp->ops == ops) {
253 mutex_unlock(&vfio.iommu_drivers_lock);
259 list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
261 mutex_unlock(&vfio.iommu_drivers_lock);
265 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
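/*
 * An IOMMU backend typically registers its ops from module_init(). A minimal
 * sketch, assuming "my_iommu_ops" is a fully populated
 * struct vfio_iommu_driver_ops (vfio_iommu_type1 follows this pattern):
 *
 *	static int __init my_iommu_init(void)
 *	{
 *		return vfio_register_iommu_driver(&my_iommu_ops);
 *	}
 *
 *	static void __exit my_iommu_exit(void)
 *	{
 *		vfio_unregister_iommu_driver(&my_iommu_ops);
 *	}
 */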
267 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
269 struct vfio_iommu_driver *driver;
271 mutex_lock(&vfio.iommu_drivers_lock);
272 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
273 if (driver->ops == ops) {
274 list_del(&driver->vfio_next);
275 mutex_unlock(&vfio.iommu_drivers_lock);
280 mutex_unlock(&vfio.iommu_drivers_lock);
282 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
284 static int vfio_iommu_group_notifier(struct notifier_block *nb,
285 unsigned long action, void *data);
286 static void vfio_group_get(struct vfio_group *group);
289 * Container objects - containers are created when /dev/vfio/vfio is
290 * opened, but their lifecycle extends until the last user is done, so
291 * it's freed via kref. Must support container/group/device being
292 * closed in any order.
294 static void vfio_container_get(struct vfio_container *container)
296 kref_get(&container->kref);
299 static void vfio_container_release(struct kref *kref)
301 struct vfio_container *container;
302 container = container_of(kref, struct vfio_container, kref);
307 static void vfio_container_put(struct vfio_container *container)
309 kref_put(&container->kref, vfio_container_release);
313 * Group objects - create, release, get, put, search
315 static struct vfio_group *
316 __vfio_group_get_from_iommu(struct iommu_group *iommu_group)
318 struct vfio_group *group;
320 list_for_each_entry(group, &vfio.group_list, vfio_next) {
321 if (group->iommu_group == iommu_group) {
322 vfio_group_get(group);
329 static struct vfio_group *
330 vfio_group_get_from_iommu(struct iommu_group *iommu_group)
332 struct vfio_group *group;
334 mutex_lock(&vfio.group_lock);
335 group = __vfio_group_get_from_iommu(iommu_group);
336 mutex_unlock(&vfio.group_lock);
340 static void vfio_group_release(struct device *dev)
342 struct vfio_group *group = container_of(dev, struct vfio_group, dev);
343 struct vfio_unbound_dev *unbound, *tmp;
345 list_for_each_entry_safe(unbound, tmp,
346 &group->unbound_list, unbound_next) {
347 list_del(&unbound->unbound_next);
351 mutex_destroy(&group->device_lock);
352 mutex_destroy(&group->unbound_lock);
353 iommu_group_put(group->iommu_group);
354 ida_free(&vfio.group_ida, MINOR(group->dev.devt));
358 static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group,
359 enum vfio_group_type type)
361 struct vfio_group *group;
364 group = kzalloc(sizeof(*group), GFP_KERNEL);
366 return ERR_PTR(-ENOMEM);
368 minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL);
371 return ERR_PTR(minor);
374 device_initialize(&group->dev);
375 group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor);
376 group->dev.class = vfio.class;
377 group->dev.release = vfio_group_release;
378 cdev_init(&group->cdev, &vfio_group_fops);
379 group->cdev.owner = THIS_MODULE;
381 refcount_set(&group->users, 1);
382 INIT_LIST_HEAD(&group->device_list);
383 mutex_init(&group->device_lock);
384 INIT_LIST_HEAD(&group->unbound_list);
385 mutex_init(&group->unbound_lock);
386 init_waitqueue_head(&group->container_q);
387 group->iommu_group = iommu_group;
388 /* put in vfio_group_release() */
389 iommu_group_ref_get(iommu_group);
391 BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
396 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
397 enum vfio_group_type type)
399 struct vfio_group *group;
400 struct vfio_group *ret;
403 group = vfio_group_alloc(iommu_group, type);
407 err = dev_set_name(&group->dev, "%s%d",
408 group->type == VFIO_NO_IOMMU ? "noiommu-" : "",
409 iommu_group_id(iommu_group));
415 group->nb.notifier_call = vfio_iommu_group_notifier;
416 err = iommu_group_register_notifier(iommu_group, &group->nb);
422 mutex_lock(&vfio.group_lock);
424 /* Did we race creating this group? */
425 ret = __vfio_group_get_from_iommu(iommu_group);
429 err = cdev_device_add(&group->cdev, &group->dev);
435 list_add(&group->vfio_next, &vfio.group_list);
437 mutex_unlock(&vfio.group_lock);
441 mutex_unlock(&vfio.group_lock);
442 iommu_group_unregister_notifier(group->iommu_group, &group->nb);
444 put_device(&group->dev);
448 static void vfio_group_put(struct vfio_group *group)
450 if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock))
454 * These data structures all have paired operations that can only be
455 * undone when the caller holds a live reference on the group. Since all
456 * pairs must be undone these WARN_ON's indicate some caller did not
457 * properly hold the group reference.
459 WARN_ON(!list_empty(&group->device_list));
460 WARN_ON(atomic_read(&group->container_users));
461 WARN_ON(group->notifier.head);
463 list_del(&group->vfio_next);
464 cdev_device_del(&group->cdev, &group->dev);
465 mutex_unlock(&vfio.group_lock);
467 iommu_group_unregister_notifier(group->iommu_group, &group->nb);
468 put_device(&group->dev);
471 static void vfio_group_get(struct vfio_group *group)
473 refcount_inc(&group->users);
476 static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
478 struct iommu_group *iommu_group;
479 struct vfio_group *group;
481 iommu_group = iommu_group_get(dev);
485 group = vfio_group_get_from_iommu(iommu_group);
486 iommu_group_put(iommu_group);
492 * Device objects - create, release, get, put, search
494 /* Device reference always implies a group reference */
495 void vfio_device_put(struct vfio_device *device)
497 if (refcount_dec_and_test(&device->refcount))
498 complete(&device->comp);
500 EXPORT_SYMBOL_GPL(vfio_device_put);
502 static bool vfio_device_try_get(struct vfio_device *device)
504 return refcount_inc_not_zero(&device->refcount);
507 static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
510 struct vfio_device *device;
512 mutex_lock(&group->device_lock);
513 list_for_each_entry(device, &group->device_list, group_next) {
514 if (device->dev == dev && vfio_device_try_get(device)) {
515 mutex_unlock(&group->device_lock);
519 mutex_unlock(&group->device_lock);
524 * Some drivers, like pci-stub, are only used to prevent other drivers from
525 * claiming a device and are therefore perfectly legitimate for a user owned
526 * group. The pci-stub driver has no dependencies on DMA or the IOVA mapping
527 * of the device, but it does prevent the user from having direct access to
528 * the device, which is useful in some circumstances.
530 * We also assume that we can include PCI interconnect devices, i.e. bridges.
531 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
532 * then all of the downstream devices will be part of the same IOMMU group as
533 * the bridge. Thus, if placing the bridge into the user owned IOVA space
534 * breaks anything, it only does so for user owned devices downstream. Note
535 * that error notification via MSI can be affected for platforms that handle
536 * MSI within the same IOVA space as DMA.
538 static const char * const vfio_driver_allowed[] = { "pci-stub" };
540 static bool vfio_dev_driver_allowed(struct device *dev,
541 struct device_driver *drv)
543 if (dev_is_pci(dev)) {
544 struct pci_dev *pdev = to_pci_dev(dev);
546 if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
550 return match_string(vfio_driver_allowed,
551 ARRAY_SIZE(vfio_driver_allowed),
556 * A vfio group is viable for use by userspace if all devices are in
557 * one of the following states:
559 * - bound to a vfio driver
560 * - bound to an otherwise allowed driver
561 * - a PCI interconnect device
563 * We use two methods to determine whether a device is bound to a vfio
564 * driver. The first is to test whether the device exists in the vfio
565 * group. The second is to test if the device exists on the group
566 * unbound_list, indicating it's in the middle of transitioning from
567 * a vfio driver to driver-less.
569 static int vfio_dev_viable(struct device *dev, void *data)
571 struct vfio_group *group = data;
572 struct vfio_device *device;
573 struct device_driver *drv = READ_ONCE(dev->driver);
574 struct vfio_unbound_dev *unbound;
577 mutex_lock(&group->unbound_lock);
578 list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
579 if (dev == unbound->dev) {
584 mutex_unlock(&group->unbound_lock);
586 if (!ret || !drv || vfio_dev_driver_allowed(dev, drv))
589 device = vfio_group_get_device(group, dev);
591 vfio_device_put(device);
599 * Async device support
601 static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
603 struct vfio_device *device;
605 /* Do we already know about it? We shouldn't */
606 device = vfio_group_get_device(group, dev);
607 if (WARN_ON_ONCE(device)) {
608 vfio_device_put(device);
612 /* Nothing to do for idle groups */
613 if (!atomic_read(&group->container_users))
616 /* TODO Prevent device auto probing */
617 dev_WARN(dev, "Device added to live group %d!\n",
618 iommu_group_id(group->iommu_group));
623 static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
625 /* We don't care what happens when the group isn't in use */
626 if (!atomic_read(&group->container_users))
629 return vfio_dev_viable(dev, group);
632 static int vfio_iommu_group_notifier(struct notifier_block *nb,
633 unsigned long action, void *data)
635 struct vfio_group *group = container_of(nb, struct vfio_group, nb);
636 struct device *dev = data;
637 struct vfio_unbound_dev *unbound;
640 case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
641 vfio_group_nb_add_dev(group, dev);
643 case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
645 * Nothing to do here. If the device is in use, then the
646 * vfio sub-driver should block the remove callback until
647 * it is unused. If the device is unused or attached to a
648 * stub driver, then it should be released and we don't
649 * care that it will be going away.
652 case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
653 dev_dbg(dev, "%s: group %d binding to driver\n", __func__,
654 iommu_group_id(group->iommu_group));
656 case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
657 dev_dbg(dev, "%s: group %d bound to driver %s\n", __func__,
658 iommu_group_id(group->iommu_group), dev->driver->name);
659 BUG_ON(vfio_group_nb_verify(group, dev));
661 case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
662 dev_dbg(dev, "%s: group %d unbinding from driver %s\n",
663 __func__, iommu_group_id(group->iommu_group),
666 case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
667 dev_dbg(dev, "%s: group %d unbound from driver\n", __func__,
668 iommu_group_id(group->iommu_group));
670 * XXX An unbound device in a live group is ok, but we'd
671 * really like to avoid the above BUG_ON by preventing other
672 * drivers from binding to it. Once that occurs, we have to
673 * stop the system to maintain isolation. At a minimum, we'd
674 * want a toggle to disable driver auto probe for this device.
677 mutex_lock(&group->unbound_lock);
678 list_for_each_entry(unbound,
679 &group->unbound_list, unbound_next) {
680 if (dev == unbound->dev) {
681 list_del(&unbound->unbound_next);
686 mutex_unlock(&group->unbound_lock);
695 void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
696 const struct vfio_device_ops *ops)
698 init_completion(&device->comp);
702 EXPORT_SYMBOL_GPL(vfio_init_group_dev);
704 void vfio_uninit_group_dev(struct vfio_device *device)
706 vfio_release_device_set(device);
708 EXPORT_SYMBOL_GPL(vfio_uninit_group_dev);
710 static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev,
711 enum vfio_group_type type)
713 struct iommu_group *iommu_group;
714 struct vfio_group *group;
717 iommu_group = iommu_group_alloc();
718 if (IS_ERR(iommu_group))
719 return ERR_CAST(iommu_group);
721 iommu_group_set_name(iommu_group, "vfio-noiommu");
722 ret = iommu_group_add_device(iommu_group, dev);
726 group = vfio_create_group(iommu_group, type);
728 ret = PTR_ERR(group);
729 goto out_remove_device;
731 iommu_group_put(iommu_group);
735 iommu_group_remove_device(dev);
737 iommu_group_put(iommu_group);
741 static struct vfio_group *vfio_group_find_or_alloc(struct device *dev)
743 struct iommu_group *iommu_group;
744 struct vfio_group *group;
746 iommu_group = iommu_group_get(dev);
747 #ifdef CONFIG_VFIO_NOIOMMU
748 if (!iommu_group && noiommu && !iommu_present(dev->bus)) {
750 * With noiommu enabled, create an IOMMU group for devices that
751 * don't already have one and don't have an iommu_ops on their
752 * bus. Taint the kernel because we're about to give a DMA
753 * capable device to a user without IOMMU protection.
755 group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU);
756 if (!IS_ERR(group)) {
757 add_taint(TAINT_USER, LOCKDEP_STILL_OK);
758 dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
764 return ERR_PTR(-EINVAL);
766 group = vfio_group_get_from_iommu(iommu_group);
768 group = vfio_create_group(iommu_group, VFIO_IOMMU);
770 /* The vfio_group holds a reference to the iommu_group */
771 iommu_group_put(iommu_group);
775 static int __vfio_register_dev(struct vfio_device *device,
776 struct vfio_group *group)
778 struct vfio_device *existing_device;
781 return PTR_ERR(group);
784 * If the driver doesn't specify a set then the device is added to a
785 * singleton set just for itself.
787 if (!device->dev_set)
788 vfio_assign_device_set(device, device);
790 existing_device = vfio_group_get_device(group, device->dev);
791 if (existing_device) {
792 dev_WARN(device->dev, "Device already exists on group %d\n",
793 iommu_group_id(group->iommu_group));
794 vfio_device_put(existing_device);
795 if (group->type == VFIO_NO_IOMMU ||
796 group->type == VFIO_EMULATED_IOMMU)
797 iommu_group_remove_device(device->dev);
798 vfio_group_put(group);
802 /* Our reference on group is moved to the device */
803 device->group = group;
805 /* Refcounting can't start until the driver calls register */
806 refcount_set(&device->refcount, 1);
808 mutex_lock(&group->device_lock);
809 list_add(&device->group_next, &group->device_list);
810 group->dev_counter++;
811 mutex_unlock(&group->device_lock);
816 int vfio_register_group_dev(struct vfio_device *device)
818 return __vfio_register_dev(device,
819 vfio_group_find_or_alloc(device->dev));
821 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
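/*
 * Typical probe-side usage, sketched with a hypothetical driver structure
 * "my_dev" that embeds a struct vfio_device named "vdev":
 *
 *	vfio_init_group_dev(&my_dev->vdev, dev, &my_vfio_ops);
 *	ret = vfio_register_group_dev(&my_dev->vdev);
 *	if (ret) {
 *		vfio_uninit_group_dev(&my_dev->vdev);
 *		return ret;
 *	}
 *	dev_set_drvdata(dev, my_dev);
 */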
824 * Register a virtual device without IOMMU backing. The user of this
825 * device must not be able to directly trigger unmediated DMA.
827 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
829 return __vfio_register_dev(device,
830 vfio_noiommu_group_alloc(device->dev, VFIO_EMULATED_IOMMU));
832 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
835 * Get a reference to the vfio_device for a device. Even if the
836 * caller thinks they own the device, they could be racing with a
837 * release call path, so we can't trust drvdata for the shortcut.
838 * Go the long way around, from the iommu_group to the vfio_group
839 * to the vfio_device.
841 struct vfio_device *vfio_device_get_from_dev(struct device *dev)
843 struct vfio_group *group;
844 struct vfio_device *device;
846 group = vfio_group_get_from_dev(dev);
850 device = vfio_group_get_device(group, dev);
851 vfio_group_put(group);
855 EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
857 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
860 struct vfio_device *it, *device = ERR_PTR(-ENODEV);
862 mutex_lock(&group->device_lock);
863 list_for_each_entry(it, &group->device_list, group_next) {
866 if (it->ops->match) {
867 ret = it->ops->match(it, buf);
869 device = ERR_PTR(ret);
873 ret = !strcmp(dev_name(it->dev), buf);
876 if (ret && vfio_device_try_get(it)) {
881 mutex_unlock(&group->device_lock);
887 * Decrement the device reference count and wait for the device to be
888 * removed. Open file descriptors for the device hold references that must be
dropped before this function can return. */
889 void vfio_unregister_group_dev(struct vfio_device *device)
891 struct vfio_group *group = device->group;
892 struct vfio_unbound_dev *unbound;
894 bool interrupted = false;
898 * When the device is removed from the group, the group suddenly
899 * becomes non-viable; the device has a driver (until the unbind
900 * completes), but it's not present in the group. This is bad news
901 * for any external users that need to re-acquire a group reference
902 * in order to match and release their existing reference. To
903 * solve this, we track such devices on the unbound_list to bridge
904 * the gap until they're fully unbound.
906 unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
908 unbound->dev = device->dev;
909 mutex_lock(&group->unbound_lock);
910 list_add(&unbound->unbound_next, &group->unbound_list);
911 mutex_unlock(&group->unbound_lock);
915 vfio_device_put(device);
916 rc = try_wait_for_completion(&device->comp);
918 if (device->ops->request)
919 device->ops->request(device, i++);
922 rc = wait_for_completion_timeout(&device->comp,
925 rc = wait_for_completion_interruptible_timeout(
926 &device->comp, HZ * 10);
929 dev_warn(device->dev,
930 "Device is currently in use, task"
932 "blocked until device is released",
933 current->comm, task_pid_nr(current));
938 mutex_lock(&group->device_lock);
939 list_del(&device->group_next);
940 group->dev_counter--;
941 mutex_unlock(&group->device_lock);
944 * In order to support multiple devices per group, devices can be
945 * plucked from the group while other devices in the group are still
946 * in use. The container persists with this group and those remaining
947 * devices still attached. If the user creates an isolation violation
948 * by binding this device to another driver while the group is still in
949 * use, that's their fault. However, in the case of removing the last,
950 * or potentially the only, device in the group there can be no other
951 * in-use devices in the group. The user has done their due diligence
952 * and we should lay no claims to those devices. In order to do that,
953 * we need to make sure the group is detached from the container.
954 * Without this stall, we're potentially racing with a user process
955 * that may attempt to immediately bind this device to another driver.
957 if (list_empty(&group->device_list))
958 wait_event(group->container_q, !group->container);
960 if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU)
961 iommu_group_remove_device(device->dev);
963 /* Matches the get in vfio_register_group_dev() */
964 vfio_group_put(group);
966 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
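/*
 * Matching remove-side sketch for the probe example above (same hypothetical
 * "my_dev"): unregister first, which waits for open file descriptors to
 * drain, then drop the device set reference.
 *
 *	vfio_unregister_group_dev(&my_dev->vdev);
 *	vfio_uninit_group_dev(&my_dev->vdev);
 */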
969 * VFIO base fd, /dev/vfio/vfio
971 static long vfio_ioctl_check_extension(struct vfio_container *container,
974 struct vfio_iommu_driver *driver;
977 down_read(&container->group_lock);
979 driver = container->iommu_driver;
982 /* No base extensions yet */
985 * If no driver is set, poll all registered drivers for
986 * extensions and return the first positive result. If
987 * a driver is already set, further queries will be passed
988 * only to that driver.
991 mutex_lock(&vfio.iommu_drivers_lock);
992 list_for_each_entry(driver, &vfio.iommu_drivers_list,
995 if (!list_empty(&container->group_list) &&
996 !vfio_iommu_driver_allowed(container,
999 if (!try_module_get(driver->ops->owner))
1002 ret = driver->ops->ioctl(NULL,
1003 VFIO_CHECK_EXTENSION,
1005 module_put(driver->ops->owner);
1009 mutex_unlock(&vfio.iommu_drivers_lock);
1011 ret = driver->ops->ioctl(container->iommu_data,
1012 VFIO_CHECK_EXTENSION, arg);
1015 up_read(&container->group_lock);
1020 /* hold write lock on container->group_lock */
1021 static int __vfio_container_attach_groups(struct vfio_container *container,
1022 struct vfio_iommu_driver *driver,
1025 struct vfio_group *group;
1028 list_for_each_entry(group, &container->group_list, container_next) {
1029 ret = driver->ops->attach_group(data, group->iommu_group,
1038 list_for_each_entry_continue_reverse(group, &container->group_list,
1040 driver->ops->detach_group(data, group->iommu_group);
1046 static long vfio_ioctl_set_iommu(struct vfio_container *container,
1049 struct vfio_iommu_driver *driver;
1052 down_write(&container->group_lock);
1055 * The container is designed to be an unprivileged interface while
1056 * the group can be assigned to specific users. Therefore, only by
1057 * adding a group to a container does the user get the privilege of
1058 * enabling the iommu, which may allocate finite resources. There
1059 * is no unset_iommu, but by removing all the groups from a container,
1060 * the container is deprivileged and returns to an unset state.
1062 if (list_empty(&container->group_list) || container->iommu_driver) {
1063 up_write(&container->group_lock);
1067 mutex_lock(&vfio.iommu_drivers_lock);
1068 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1071 if (!vfio_iommu_driver_allowed(container, driver))
1073 if (!try_module_get(driver->ops->owner))
1077 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1078 * so test which iommu driver reported support for this
1079 * extension and call open on them. We also pass them the
1080 * magic, allowing a single driver to support multiple
1081 * interfaces if they'd like.
1083 if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1084 module_put(driver->ops->owner);
1088 data = driver->ops->open(arg);
1090 ret = PTR_ERR(data);
1091 module_put(driver->ops->owner);
1095 ret = __vfio_container_attach_groups(container, driver, data);
1097 driver->ops->release(data);
1098 module_put(driver->ops->owner);
1102 container->iommu_driver = driver;
1103 container->iommu_data = data;
1107 mutex_unlock(&vfio.iommu_drivers_lock);
1108 up_write(&container->group_lock);
1113 static long vfio_fops_unl_ioctl(struct file *filep,
1114 unsigned int cmd, unsigned long arg)
1116 struct vfio_container *container = filep->private_data;
1117 struct vfio_iommu_driver *driver;
1125 case VFIO_GET_API_VERSION:
1126 ret = VFIO_API_VERSION;
1128 case VFIO_CHECK_EXTENSION:
1129 ret = vfio_ioctl_check_extension(container, arg);
1131 case VFIO_SET_IOMMU:
1132 ret = vfio_ioctl_set_iommu(container, arg);
1135 driver = container->iommu_driver;
1136 data = container->iommu_data;
1138 if (driver) /* passthrough all unrecognized ioctls */
1139 ret = driver->ops->ioctl(data, cmd, arg);
1145 static int vfio_fops_open(struct inode *inode, struct file *filep)
1147 struct vfio_container *container;
1149 container = kzalloc(sizeof(*container), GFP_KERNEL);
1153 INIT_LIST_HEAD(&container->group_list);
1154 init_rwsem(&container->group_lock);
1155 kref_init(&container->kref);
1157 filep->private_data = container;
1162 static int vfio_fops_release(struct inode *inode, struct file *filep)
1164 struct vfio_container *container = filep->private_data;
1165 struct vfio_iommu_driver *driver = container->iommu_driver;
1167 if (driver && driver->ops->notify)
1168 driver->ops->notify(container->iommu_data,
1169 VFIO_IOMMU_CONTAINER_CLOSE);
1171 filep->private_data = NULL;
1173 vfio_container_put(container);
1178 static const struct file_operations vfio_fops = {
1179 .owner = THIS_MODULE,
1180 .open = vfio_fops_open,
1181 .release = vfio_fops_release,
1182 .unlocked_ioctl = vfio_fops_unl_ioctl,
1183 .compat_ioctl = compat_ptr_ioctl,
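/*
 * Userspace view of the container interface above, a minimal sketch following
 * Documentation/driver-api/vfio.rst (error handling omitted; "26" stands in
 * for whatever group number the device landed in):
 *
 *	container = open("/dev/vfio/vfio", O_RDWR);
 *	ioctl(container, VFIO_GET_API_VERSION);
 *	ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
 *	group = open("/dev/vfio/26", O_RDWR);
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */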
1187 * VFIO Group fd, /dev/vfio/$GROUP
1189 static void __vfio_group_unset_container(struct vfio_group *group)
1191 struct vfio_container *container = group->container;
1192 struct vfio_iommu_driver *driver;
1194 down_write(&container->group_lock);
1196 driver = container->iommu_driver;
1198 driver->ops->detach_group(container->iommu_data,
1199 group->iommu_group);
1201 group->container = NULL;
1202 wake_up(&group->container_q);
1203 list_del(&group->container_next);
1205 /* Detaching the last group deprivileges a container, remove iommu */
1206 if (driver && list_empty(&container->group_list)) {
1207 driver->ops->release(container->iommu_data);
1208 module_put(driver->ops->owner);
1209 container->iommu_driver = NULL;
1210 container->iommu_data = NULL;
1213 up_write(&container->group_lock);
1215 vfio_container_put(container);
1219 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1220 * if there was no container to unset. Since the ioctl is called on
1221 * the group, we know that it still exists, therefore the only valid
1222 * transition here is 1->0.
1224 static int vfio_group_unset_container(struct vfio_group *group)
1226 int users = atomic_cmpxchg(&group->container_users, 1, 0);
1233 __vfio_group_unset_container(group);
1239 * When removing container users, anything that removes the last user
1240 * implicitly removes the group from the container. That is, if the
1241 * group file descriptor is closed, as well as any device file descriptors,
1242 * the group is free.
1244 static void vfio_group_try_dissolve_container(struct vfio_group *group)
1246 if (0 == atomic_dec_if_positive(&group->container_users))
1247 __vfio_group_unset_container(group);
1250 static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1253 struct vfio_container *container;
1254 struct vfio_iommu_driver *driver;
1257 if (atomic_read(&group->container_users))
1260 if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
1263 f = fdget(container_fd);
1267 /* Sanity check, is this really our fd? */
1268 if (f.file->f_op != &vfio_fops) {
1273 container = f.file->private_data;
1274 WARN_ON(!container); /* fget ensures we don't race vfio_release */
1276 down_write(&container->group_lock);
1278 /* Real groups and fake groups cannot mix */
1279 if (!list_empty(&container->group_list) &&
1280 container->noiommu != (group->type == VFIO_NO_IOMMU)) {
1285 driver = container->iommu_driver;
1287 ret = driver->ops->attach_group(container->iommu_data,
1294 group->container = container;
1295 container->noiommu = (group->type == VFIO_NO_IOMMU);
1296 list_add(&group->container_next, &container->group_list);
1298 /* Get a reference on the container and mark a user within the group */
1299 vfio_container_get(container);
1300 atomic_inc(&group->container_users);
1303 up_write(&container->group_lock);
1308 static bool vfio_group_viable(struct vfio_group *group)
1310 return (iommu_group_for_each_dev(group->iommu_group,
1311 group, vfio_dev_viable) == 0);
1314 static int vfio_group_add_container_user(struct vfio_group *group)
1316 if (!atomic_inc_not_zero(&group->container_users))
1319 if (group->type == VFIO_NO_IOMMU) {
1320 atomic_dec(&group->container_users);
1323 if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1324 atomic_dec(&group->container_users);
1331 static const struct file_operations vfio_device_fops;
1333 static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1335 struct vfio_device *device;
1340 if (0 == atomic_read(&group->container_users) ||
1341 !group->container->iommu_driver || !vfio_group_viable(group))
1344 if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
1347 device = vfio_device_get_from_name(group, buf);
1349 return PTR_ERR(device);
1351 if (!try_module_get(device->dev->driver->owner)) {
1353 goto err_device_put;
1356 mutex_lock(&device->dev_set->lock);
1357 device->open_count++;
1358 if (device->open_count == 1 && device->ops->open_device) {
1359 ret = device->ops->open_device(device);
1361 goto err_undo_count;
1363 mutex_unlock(&device->dev_set->lock);
1366 * We can't use anon_inode_getfd() because we need to modify
1367 * the f_mode flags directly to allow more than just ioctls
1369 fdno = ret = get_unused_fd_flags(O_CLOEXEC);
1371 goto err_close_device;
1373 filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1375 if (IS_ERR(filep)) {
1376 ret = PTR_ERR(filep);
1381 * TODO: add an anon_inode interface to do this.
1382 * Appears to be missing by lack of need rather than
1383 * explicitly prevented. Now there's need.
1385 filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1387 atomic_inc(&group->container_users);
1389 fd_install(fdno, filep);
1391 if (group->type == VFIO_NO_IOMMU)
1392 dev_warn(device->dev, "vfio-noiommu device opened by user "
1393 "(%s:%d)\n", current->comm, task_pid_nr(current));
1397 put_unused_fd(fdno);
1399 mutex_lock(&device->dev_set->lock);
1400 if (device->open_count == 1 && device->ops->close_device)
1401 device->ops->close_device(device);
1403 device->open_count--;
1404 mutex_unlock(&device->dev_set->lock);
1405 module_put(device->dev->driver->owner);
1407 vfio_device_put(device);
1411 static long vfio_group_fops_unl_ioctl(struct file *filep,
1412 unsigned int cmd, unsigned long arg)
1414 struct vfio_group *group = filep->private_data;
1418 case VFIO_GROUP_GET_STATUS:
1420 struct vfio_group_status status;
1421 unsigned long minsz;
1423 minsz = offsetofend(struct vfio_group_status, flags);
1425 if (copy_from_user(&status, (void __user *)arg, minsz))
1428 if (status.argsz < minsz)
1433 if (vfio_group_viable(group))
1434 status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1436 if (group->container)
1437 status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1439 if (copy_to_user((void __user *)arg, &status, minsz))
1445 case VFIO_GROUP_SET_CONTAINER:
1449 if (get_user(fd, (int __user *)arg))
1455 ret = vfio_group_set_container(group, fd);
1458 case VFIO_GROUP_UNSET_CONTAINER:
1459 ret = vfio_group_unset_container(group);
1461 case VFIO_GROUP_GET_DEVICE_FD:
1465 buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1467 return PTR_ERR(buf);
1469 ret = vfio_group_get_device_fd(group, buf);
1478 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1480 struct vfio_group *group =
1481 container_of(inode->i_cdev, struct vfio_group, cdev);
1484 /* users can be zero if this races with vfio_group_put() */
1485 if (!refcount_inc_not_zero(&group->users))
1488 if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) {
1489 vfio_group_put(group);
1493 /* Do we need multiple instances of the group open? Seems not. */
1494 opened = atomic_cmpxchg(&group->opened, 0, 1);
1496 vfio_group_put(group);
1500 /* Is something still in use from a previous open? */
1501 if (group->container) {
1502 atomic_dec(&group->opened);
1503 vfio_group_put(group);
1507 /* Warn if previous user didn't cleanup and re-init to drop them */
1508 if (WARN_ON(group->notifier.head))
1509 BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1511 filep->private_data = group;
1516 static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1518 struct vfio_group *group = filep->private_data;
1520 filep->private_data = NULL;
1522 vfio_group_try_dissolve_container(group);
1524 atomic_dec(&group->opened);
1526 vfio_group_put(group);
1531 static const struct file_operations vfio_group_fops = {
1532 .owner = THIS_MODULE,
1533 .unlocked_ioctl = vfio_group_fops_unl_ioctl,
1534 .compat_ioctl = compat_ptr_ioctl,
1535 .open = vfio_group_fops_open,
1536 .release = vfio_group_fops_release,
1542 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1544 struct vfio_device *device = filep->private_data;
1546 mutex_lock(&device->dev_set->lock);
1547 if (!--device->open_count && device->ops->close_device)
1548 device->ops->close_device(device);
1549 mutex_unlock(&device->dev_set->lock);
1551 module_put(device->dev->driver->owner);
1553 vfio_group_try_dissolve_container(device->group);
1555 vfio_device_put(device);
1560 static long vfio_device_fops_unl_ioctl(struct file *filep,
1561 unsigned int cmd, unsigned long arg)
1563 struct vfio_device *device = filep->private_data;
1565 if (unlikely(!device->ops->ioctl))
1568 return device->ops->ioctl(device, cmd, arg);
1571 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1572 size_t count, loff_t *ppos)
1574 struct vfio_device *device = filep->private_data;
1576 if (unlikely(!device->ops->read))
1579 return device->ops->read(device, buf, count, ppos);
1582 static ssize_t vfio_device_fops_write(struct file *filep,
1583 const char __user *buf,
1584 size_t count, loff_t *ppos)
1586 struct vfio_device *device = filep->private_data;
1588 if (unlikely(!device->ops->write))
1591 return device->ops->write(device, buf, count, ppos);
1594 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1596 struct vfio_device *device = filep->private_data;
1598 if (unlikely(!device->ops->mmap))
1601 return device->ops->mmap(device, vma);
1604 static const struct file_operations vfio_device_fops = {
1605 .owner = THIS_MODULE,
1606 .release = vfio_device_fops_release,
1607 .read = vfio_device_fops_read,
1608 .write = vfio_device_fops_write,
1609 .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1610 .compat_ioctl = compat_ptr_ioctl,
1611 .mmap = vfio_device_fops_mmap,
1615 * External user API, exported by symbols to be linked dynamically.
1617 * The protocol includes:
1618 * 1. do normal VFIO init operation:
1619 * - opening a new container;
1620 * - attaching group(s) to it;
1621 * - setting an IOMMU driver for a container.
1622 * When IOMMU is set for a container, all groups in it are
1623 * considered ready to use by an external user.
1625 * 2. User space passes a group fd to an external user.
1626 * The external user calls vfio_group_get_external_user()
1627 * to verify that:
1628 * - the group is initialized;
1629 * - IOMMU is set for it.
1630 * If both checks passed, vfio_group_get_external_user()
1631 * increments the container user counter to prevent
1632 * the VFIO group from disposal before KVM exits.
1634 * 3. The external user calls vfio_external_user_iommu_id()
1635 * to know an IOMMU ID.
1637 * 4. When the external KVM finishes, it calls
1638 * vfio_group_put_external_user() to release the VFIO group.
1639 * This call decrements the container user counter.
1641 struct vfio_group *vfio_group_get_external_user(struct file *filep)
1643 struct vfio_group *group = filep->private_data;
1646 if (filep->f_op != &vfio_group_fops)
1647 return ERR_PTR(-EINVAL);
1649 ret = vfio_group_add_container_user(group);
1651 return ERR_PTR(ret);
1654 * Since the caller holds the fget on the file, group->users must be >= 1
1656 vfio_group_get(group);
1660 EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
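/*
 * Kernel-side sketch of the protocol above, as a KVM-style consumer might use
 * it (assumes the caller already holds a reference on @filp, e.g. via fdget()):
 *
 *	group = vfio_group_get_external_user(filp);
 *	if (IS_ERR(group))
 *		return PTR_ERR(group);
 *	id = vfio_external_user_iommu_id(group);
 *	...
 *	vfio_group_put_external_user(group);
 */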
1663 * External user API, exported by symbols to be linked dynamically.
1664 * The external user passes in a device pointer
1665 * to verify that:
1666 * - A VFIO group is associated with the device;
1667 * - IOMMU is set for the group.
1668 * If both checks passed, vfio_group_get_external_user_from_dev()
1669 * increments the container user counter to prevent the VFIO group
1670 * from disposal before external user exits and returns the pointer
1671 * to the VFIO group.
1673 * When the external user finishes using the VFIO group, it calls
1674 * vfio_group_put_external_user() to release the VFIO group and
1675 * decrement the container user counter.
1677 * @dev [in] : device
1678 * Return error PTR or pointer to VFIO group.
1681 struct vfio_group *vfio_group_get_external_user_from_dev(struct device *dev)
1683 struct vfio_group *group;
1686 group = vfio_group_get_from_dev(dev);
1688 return ERR_PTR(-ENODEV);
1690 ret = vfio_group_add_container_user(group);
1692 vfio_group_put(group);
1693 return ERR_PTR(ret);
1698 EXPORT_SYMBOL_GPL(vfio_group_get_external_user_from_dev);
1700 void vfio_group_put_external_user(struct vfio_group *group)
1702 vfio_group_try_dissolve_container(group);
1703 vfio_group_put(group);
1705 EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1707 bool vfio_external_group_match_file(struct vfio_group *test_group,
1710 struct vfio_group *group = filep->private_data;
1712 return (filep->f_op == &vfio_group_fops) && (group == test_group);
1714 EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1716 int vfio_external_user_iommu_id(struct vfio_group *group)
1718 return iommu_group_id(group->iommu_group);
1720 EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1722 long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1724 return vfio_ioctl_check_extension(group->container, arg);
1726 EXPORT_SYMBOL_GPL(vfio_external_check_extension);
1729 * Sub-module support
1732 * Helper for managing a buffer of info chain capabilities: allocate or
1733 * reallocate a buffer with additional @size, filling in @id and @version
1734 * of the capability. A pointer to the new capability is returned.
1736 * NB. The chain is based at the head of the buffer, so new entries are
1737 * added to the tail, vfio_info_cap_shift() should be called to fixup the
1738 * next offsets prior to copying to the user buffer.
1740 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1741 size_t size, u16 id, u16 version)
1744 struct vfio_info_cap_header *header, *tmp;
1746 buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1750 return ERR_PTR(-ENOMEM);
1754 header = buf + caps->size;
1756 /* Eventually copied to user buffer, zero */
1757 memset(header, 0, size);
1760 header->version = version;
1762 /* Add to the end of the capability chain */
1763 for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1766 tmp->next = caps->size;
1771 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1773 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1775 struct vfio_info_cap_header *tmp;
1776 void *buf = (void *)caps->buf;
1778 for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1779 tmp->next += offset;
1781 EXPORT_SYMBOL(vfio_info_cap_shift);
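/*
 * Sketch of building a capability chain for an _INFO ioctl, loosely modeled
 * on how the type1 driver reports IOVA ranges; "MY_CAP_ID", "payload_size"
 * and "info" are placeholders for the caller's own definitions:
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *	struct vfio_info_cap_header *header;
 *
 *	header = vfio_info_cap_add(&caps, sizeof(*header) + payload_size,
 *				   MY_CAP_ID, 1);
 *	if (IS_ERR(header))
 *		return PTR_ERR(header);
 *	...
 *	vfio_info_cap_shift(&caps, sizeof(info));
 *	copy_to_user(..., caps.buf, caps.size);
 *	kfree(caps.buf);
 */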
1783 int vfio_info_add_capability(struct vfio_info_cap *caps,
1784 struct vfio_info_cap_header *cap, size_t size)
1786 struct vfio_info_cap_header *header;
1788 header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1790 return PTR_ERR(header);
1792 memcpy(header + 1, cap + 1, size - sizeof(*header));
1796 EXPORT_SYMBOL(vfio_info_add_capability);
1798 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1799 int max_irq_type, size_t *data_size)
1801 unsigned long minsz;
1804 minsz = offsetofend(struct vfio_irq_set, count);
1806 if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1807 (hdr->count >= (U32_MAX - hdr->start)) ||
1808 (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1809 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1815 if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1818 switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1819 case VFIO_IRQ_SET_DATA_NONE:
1822 case VFIO_IRQ_SET_DATA_BOOL:
1823 size = sizeof(uint8_t);
1825 case VFIO_IRQ_SET_DATA_EVENTFD:
1826 size = sizeof(int32_t);
1833 if (hdr->argsz - minsz < hdr->count * size)
1839 *data_size = hdr->count * size;
1844 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
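/*
 * Typical caller pattern in a driver's VFIO_DEVICE_SET_IRQS handler, a sketch
 * along the lines of vfio-pci ("my_num_irqs" and "MY_NUM_IRQ_TYPES" are
 * placeholders for the device's own limits):
 *
 *	minsz = offsetofend(struct vfio_irq_set, count);
 *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *		return -EFAULT;
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, my_num_irqs,
 *						 MY_NUM_IRQ_TYPES, &data_size);
 *	if (ret)
 *		return ret;
 *	if (data_size)
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 */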
1847 * Pin a set of guest PFNs and return their associated host PFNs for local
1848 * domain only.
1849 * @dev [in] : device
1850 * @user_pfn [in]: array of user/guest PFNs to be pinned.
1851 * @npage [in] : count of elements in user_pfn array. This count should not
1852 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1853 * @prot [in] : protection flags
1854 * @phys_pfn[out]: array of host PFNs
1855 * Return error or number of pages pinned.
1857 int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
1858 int prot, unsigned long *phys_pfn)
1860 struct vfio_container *container;
1861 struct vfio_group *group;
1862 struct vfio_iommu_driver *driver;
1865 if (!dev || !user_pfn || !phys_pfn || !npage)
1868 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1871 group = vfio_group_get_from_dev(dev);
1875 if (group->dev_counter > 1) {
1880 ret = vfio_group_add_container_user(group);
1884 container = group->container;
1885 driver = container->iommu_driver;
1886 if (likely(driver && driver->ops->pin_pages))
1887 ret = driver->ops->pin_pages(container->iommu_data,
1888 group->iommu_group, user_pfn,
1889 npage, prot, phys_pfn);
1893 vfio_group_try_dissolve_container(group);
1896 vfio_group_put(group);
1899 EXPORT_SYMBOL(vfio_pin_pages);
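/*
 * Sketch of a mediated driver pinning a single guest page for CPU access
 * ("mdev_dev" is the caller's device and "gfn" the guest page frame number):
 *
 *	unsigned long pfn;
 *
 *	ret = vfio_pin_pages(mdev_dev, &gfn, 1, IOMMU_READ | IOMMU_WRITE, &pfn);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *	...
 *	vfio_unpin_pages(mdev_dev, &gfn, 1);
 */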
1902 * Unpin set of host PFNs for local domain only.
1903 * @dev [in] : device
1904 * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
1905 * PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1906 * @npage [in] : count of elements in user_pfn array. This count should not
1907 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1908 * Return error or number of pages unpinned.
1910 int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
1912 struct vfio_container *container;
1913 struct vfio_group *group;
1914 struct vfio_iommu_driver *driver;
1917 if (!dev || !user_pfn || !npage)
1920 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1923 group = vfio_group_get_from_dev(dev);
1927 ret = vfio_group_add_container_user(group);
1929 goto err_unpin_pages;
1931 container = group->container;
1932 driver = container->iommu_driver;
1933 if (likely(driver && driver->ops->unpin_pages))
1934 ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
1939 vfio_group_try_dissolve_container(group);
1942 vfio_group_put(group);
1945 EXPORT_SYMBOL(vfio_unpin_pages);
1948 * Pin a set of guest IOVA PFNs and return their associated host PFNs for a
1949 * VFIO group.
1951 * The caller needs to call vfio_group_get_external_user() or
1952 * vfio_group_get_external_user_from_dev() prior to calling this interface,
1953 * so as to prevent the VFIO group from disposal in the middle of the call.
1954 * But it can keep the reference to the VFIO group for several calls into
1955 * this interface.
1956 * After it has finished using the VFIO group, the caller needs to release the
1957 * VFIO group by calling vfio_group_put_external_user().
1959 * @group [in] : VFIO group
1960 * @user_iova_pfn [in] : array of user/guest IOVA PFNs to be pinned.
1961 * @npage [in] : count of elements in user_iova_pfn array.
1962 * This count should not be greater than
1963 * VFIO_PIN_PAGES_MAX_ENTRIES.
1964 * @prot [in] : protection flags
1965 * @phys_pfn [out] : array of host PFNs
1966 * Return error or number of pages pinned.
1968 int vfio_group_pin_pages(struct vfio_group *group,
1969 unsigned long *user_iova_pfn, int npage,
1970 int prot, unsigned long *phys_pfn)
1972 struct vfio_container *container;
1973 struct vfio_iommu_driver *driver;
1976 if (!group || !user_iova_pfn || !phys_pfn || !npage)
1979 if (group->dev_counter > 1)
1982 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1985 container = group->container;
1986 driver = container->iommu_driver;
1987 if (likely(driver && driver->ops->pin_pages))
1988 ret = driver->ops->pin_pages(container->iommu_data,
1989 group->iommu_group, user_iova_pfn,
1990 npage, prot, phys_pfn);
1996 EXPORT_SYMBOL(vfio_group_pin_pages);
1999 * Unpin a set of guest IOVA PFNs for a VFIO group.
2001 * The caller needs to call vfio_group_get_external_user() or
2002 * vfio_group_get_external_user_from_dev() prior to calling this interface,
2003 * so as to prevent the VFIO group from disposal in the middle of the call.
2004 * But it can keep the reference to the VFIO group for several calls into
2005 * this interface.
2006 * After it has finished using the VFIO group, the caller needs to release the
2007 * VFIO group by calling vfio_group_put_external_user().
2009 * @group [in] : vfio group
2010 * @user_iova_pfn [in] : array of user/guest IOVA PFNs to be unpinned.
2011 * @npage [in] : count of elements in user_iova_pfn array.
2012 * This count should not be greater than
2013 * VFIO_PIN_PAGES_MAX_ENTRIES.
2014 * Return error or number of pages unpinned.
2016 int vfio_group_unpin_pages(struct vfio_group *group,
2017 unsigned long *user_iova_pfn, int npage)
2019 struct vfio_container *container;
2020 struct vfio_iommu_driver *driver;
2023 if (!group || !user_iova_pfn || !npage)
2026 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2029 container = group->container;
2030 driver = container->iommu_driver;
2031 if (likely(driver && driver->ops->unpin_pages))
2032 ret = driver->ops->unpin_pages(container->iommu_data,
2033 user_iova_pfn, npage);
2039 EXPORT_SYMBOL(vfio_group_unpin_pages);
2043 * This interface allows the CPUs to perform virtual DMA on
2044 * behalf of the device.
2046 * CPUs read/write from/into a range of IOVAs pointing to user space memory
2047 * into/from a kernel buffer.
2049 * As the read/write of user space memory is conducted via the CPUs and is
2050 * not a real device DMA, it is not necessary to pin the user space memory.
2052 * The caller needs to call vfio_group_get_external_user() or
2053 * vfio_group_get_external_user_from_dev() prior to calling this interface,
2054 * so as to prevent the VFIO group from disposal in the middle of the call.
2055 * But it can keep the reference to the VFIO group for several calls into
2056 * this interface.
2057 * After it has finished using the VFIO group, the caller needs to release the
2058 * VFIO group by calling vfio_group_put_external_user().
2060 * @group [in] : VFIO group
2061 * @user_iova [in] : base IOVA of a user space buffer
2062 * @data [in] : pointer to kernel buffer
2063 * @len [in] : kernel buffer length
2064 * @write : true for a write, false for a read
2065 * Return error code on failure or 0 on success.
2067 int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova,
2068 void *data, size_t len, bool write)
2070 struct vfio_container *container;
2071 struct vfio_iommu_driver *driver;
2074 if (!group || !data || len <= 0)
2077 container = group->container;
2078 driver = container->iommu_driver;
2080 if (likely(driver && driver->ops->dma_rw))
2081 ret = driver->ops->dma_rw(container->iommu_data,
2082 user_iova, data, len, write);
2088 EXPORT_SYMBOL(vfio_dma_rw);
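/*
 * Sketch of reading guest memory through the group's IOMMU mappings (the
 * group reference is assumed to come from vfio_group_get_external_user();
 * passing false selects a read into the kernel buffer):
 *
 *	ret = vfio_dma_rw(group, gpa, buf, len, false);
 *	if (ret)
 *		return ret;
 */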
2090 static int vfio_register_iommu_notifier(struct vfio_group *group,
2091 unsigned long *events,
2092 struct notifier_block *nb)
2094 struct vfio_container *container;
2095 struct vfio_iommu_driver *driver;
2098 ret = vfio_group_add_container_user(group);
2102 container = group->container;
2103 driver = container->iommu_driver;
2104 if (likely(driver && driver->ops->register_notifier))
2105 ret = driver->ops->register_notifier(container->iommu_data,
2110 vfio_group_try_dissolve_container(group);
2115 static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2116 struct notifier_block *nb)
2118 struct vfio_container *container;
2119 struct vfio_iommu_driver *driver;
2122 ret = vfio_group_add_container_user(group);
2126 container = group->container;
2127 driver = container->iommu_driver;
2128 if (likely(driver && driver->ops->unregister_notifier))
2129 ret = driver->ops->unregister_notifier(container->iommu_data,
2134 vfio_group_try_dissolve_container(group);
2139 void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2142 blocking_notifier_call_chain(&group->notifier,
2143 VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2145 EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
2147 static int vfio_register_group_notifier(struct vfio_group *group,
2148 unsigned long *events,
2149 struct notifier_block *nb)
2152 bool set_kvm = false;
2154 if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2157 /* clear known events */
2158 *events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2160 /* refuse to continue if any events remain */
2164 ret = vfio_group_add_container_user(group);
2168 ret = blocking_notifier_chain_register(&group->notifier, nb);
2171 * The attach of kvm to the vfio_group may already have happened, so
2172 * here we replay it once upon registration.
2174 if (!ret && set_kvm && group->kvm)
2175 blocking_notifier_call_chain(&group->notifier,
2176 VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2178 vfio_group_try_dissolve_container(group);
2183 static int vfio_unregister_group_notifier(struct vfio_group *group,
2184 struct notifier_block *nb)
2188 ret = vfio_group_add_container_user(group);
2192 ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2194 vfio_group_try_dissolve_container(group);
2199 int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2200 unsigned long *events, struct notifier_block *nb)
2202 struct vfio_group *group;
2205 if (!dev || !nb || !events || (*events == 0))
2208 group = vfio_group_get_from_dev(dev);
2213 case VFIO_IOMMU_NOTIFY:
2214 ret = vfio_register_iommu_notifier(group, events, nb);
2216 case VFIO_GROUP_NOTIFY:
2217 ret = vfio_register_group_notifier(group, events, nb);
2223 vfio_group_put(group);
2226 EXPORT_SYMBOL(vfio_register_notifier);
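/*
 * Sketch of subscribing to DMA unmap events so a driver can invalidate its
 * own pinnings ("my_nb" is the caller's notifier_block and "my_dma_unmap_cb"
 * its callback):
 *
 *	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
 *
 *	my_nb.notifier_call = my_dma_unmap_cb;
 *	ret = vfio_register_notifier(dev, VFIO_IOMMU_NOTIFY, &events, &my_nb);
 *	...
 *	vfio_unregister_notifier(dev, VFIO_IOMMU_NOTIFY, &my_nb);
 */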
2228 int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2229 struct notifier_block *nb)
2231 struct vfio_group *group;
2237 group = vfio_group_get_from_dev(dev);
2242 case VFIO_IOMMU_NOTIFY:
2243 ret = vfio_unregister_iommu_notifier(group, nb);
2245 case VFIO_GROUP_NOTIFY:
2246 ret = vfio_unregister_group_notifier(group, nb);
2252 vfio_group_put(group);
2255 EXPORT_SYMBOL(vfio_unregister_notifier);
2257 struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group)
2259 struct vfio_container *container;
2260 struct vfio_iommu_driver *driver;
2263 return ERR_PTR(-EINVAL);
2265 container = group->container;
2266 driver = container->iommu_driver;
2267 if (likely(driver && driver->ops->group_iommu_domain))
2268 return driver->ops->group_iommu_domain(container->iommu_data,
2269 group->iommu_group);
2271 return ERR_PTR(-ENOTTY);
2273 EXPORT_SYMBOL_GPL(vfio_group_iommu_domain);
2276 * Module/class support
2278 static char *vfio_devnode(struct device *dev, umode_t *mode)
2280 return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2283 static struct miscdevice vfio_dev = {
2284 .minor = VFIO_MINOR,
2287 .nodename = "vfio/vfio",
2288 .mode = S_IRUGO | S_IWUGO,
2291 static int __init vfio_init(void)
2295 ida_init(&vfio.group_ida);
2296 mutex_init(&vfio.group_lock);
2297 mutex_init(&vfio.iommu_drivers_lock);
2298 INIT_LIST_HEAD(&vfio.group_list);
2299 INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2301 ret = misc_register(&vfio_dev);
2303 pr_err("vfio: misc device register failed\n");
2307 /* /dev/vfio/$GROUP */
2308 vfio.class = class_create(THIS_MODULE, "vfio");
2309 if (IS_ERR(vfio.class)) {
2310 ret = PTR_ERR(vfio.class);
2314 vfio.class->devnode = vfio_devnode;
2316 ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
2318 goto err_alloc_chrdev;
2320 pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2322 #ifdef CONFIG_VFIO_NOIOMMU
2323 vfio_register_iommu_driver(&vfio_noiommu_ops);
2328 class_destroy(vfio.class);
2331 misc_deregister(&vfio_dev);
2335 static void __exit vfio_cleanup(void)
2337 WARN_ON(!list_empty(&vfio.group_list));
2339 #ifdef CONFIG_VFIO_NOIOMMU
2340 vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2342 ida_destroy(&vfio.group_ida);
2343 unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2344 class_destroy(vfio.class);
2346 misc_deregister(&vfio_dev);
2347 xa_destroy(&vfio_device_set_xa);
2350 module_init(vfio_init);
2351 module_exit(vfio_cleanup);
2353 MODULE_VERSION(DRIVER_VERSION);
2354 MODULE_LICENSE("GPL v2");
2355 MODULE_AUTHOR(DRIVER_AUTHOR);
2356 MODULE_DESCRIPTION(DRIVER_DESC);
2357 MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2358 MODULE_ALIAS("devname:vfio/vfio");
2359 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");