1 // SPDX-License-Identifier: GPL-2.0-only
5 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
6 * Author: Alex Williamson <alex.williamson@redhat.com>
8 * Derived from original vfio:
9 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
10 * Author: Tom Lyon, pugs@cisco.com
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/file.h>
17 #include <linux/anon_inodes.h>
19 #include <linux/idr.h>
20 #include <linux/iommu.h>
21 #include <linux/list.h>
22 #include <linux/miscdevice.h>
23 #include <linux/module.h>
24 #include <linux/mutex.h>
25 #include <linux/pci.h>
26 #include <linux/rwsem.h>
27 #include <linux/sched.h>
28 #include <linux/slab.h>
29 #include <linux/stat.h>
30 #include <linux/string.h>
31 #include <linux/uaccess.h>
32 #include <linux/vfio.h>
33 #include <linux/wait.h>
34 #include <linux/sched/signal.h>
37 #define DRIVER_VERSION "0.3"
38 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
39 #define DRIVER_DESC "VFIO - User Level meta-driver"
43 struct list_head iommu_drivers_list;
44 struct mutex iommu_drivers_lock;
45 struct list_head group_list;
46 struct mutex group_lock; /* locks group_list */
51 struct vfio_iommu_driver {
52 const struct vfio_iommu_driver_ops *ops;
53 struct list_head vfio_next;
56 struct vfio_container {
58 struct list_head group_list;
59 struct rw_semaphore group_lock;
60 struct vfio_iommu_driver *iommu_driver;
69 unsigned int container_users;
70 struct iommu_group *iommu_group;
71 struct vfio_container *container;
72 struct list_head device_list;
73 struct mutex device_lock;
74 struct list_head vfio_next;
75 struct list_head container_next;
76 enum vfio_group_type type;
77 unsigned int dev_counter;
78 struct rw_semaphore group_rwsem;
80 struct file *opened_file;
81 struct blocking_notifier_head notifier;
84 #ifdef CONFIG_VFIO_NOIOMMU
85 static bool noiommu __read_mostly;
86 module_param_named(enable_unsafe_noiommu_mode,
87 noiommu, bool, S_IRUGO | S_IWUSR);
88 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
91 static DEFINE_XARRAY(vfio_device_set_xa);
92 static const struct file_operations vfio_group_fops;
94 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
96 unsigned long idx = (unsigned long)set_id;
97 struct vfio_device_set *new_dev_set;
98 struct vfio_device_set *dev_set;
100 if (WARN_ON(!set_id))
104 * Atomically acquire a singleton object in the xarray for this set_id
106 xa_lock(&vfio_device_set_xa);
107 dev_set = xa_load(&vfio_device_set_xa, idx);
110 xa_unlock(&vfio_device_set_xa);
112 new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
115 mutex_init(&new_dev_set->lock);
116 INIT_LIST_HEAD(&new_dev_set->device_list);
117 new_dev_set->set_id = set_id;
119 xa_lock(&vfio_device_set_xa);
120 dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
123 dev_set = new_dev_set;
128 if (xa_is_err(dev_set)) {
129 xa_unlock(&vfio_device_set_xa);
130 return xa_err(dev_set);
134 dev_set->device_count++;
135 xa_unlock(&vfio_device_set_xa);
136 mutex_lock(&dev_set->lock);
137 device->dev_set = dev_set;
138 list_add_tail(&device->dev_set_list, &dev_set->device_list);
139 mutex_unlock(&dev_set->lock);
142 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
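/*
 * Example (editorial sketch, not part of the original source): a driver
 * whose devices can only be reset together may place them all in one
 * dev_set by passing the same set_id cookie for each device before
 * registering it; devices that never call vfio_assign_device_set() get a
 * singleton set in __vfio_register_dev(). The my_* names are hypothetical.
 *
 *	static int my_join_shared_set(struct vfio_device *vdev, void *token)
 *	{
 *		return vfio_assign_device_set(vdev, token);
 *	}
 */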
144 static void vfio_release_device_set(struct vfio_device *device)
146 struct vfio_device_set *dev_set = device->dev_set;
151 mutex_lock(&dev_set->lock);
152 list_del(&device->dev_set_list);
153 mutex_unlock(&dev_set->lock);
155 xa_lock(&vfio_device_set_xa);
156 if (!--dev_set->device_count) {
157 __xa_erase(&vfio_device_set_xa,
158 (unsigned long)dev_set->set_id);
159 mutex_destroy(&dev_set->lock);
162 xa_unlock(&vfio_device_set_xa);
165 #ifdef CONFIG_VFIO_NOIOMMU
166 static void *vfio_noiommu_open(unsigned long arg)
168 if (arg != VFIO_NOIOMMU_IOMMU)
169 return ERR_PTR(-EINVAL);
170 if (!capable(CAP_SYS_RAWIO))
171 return ERR_PTR(-EPERM);
176 static void vfio_noiommu_release(void *iommu_data)
180 static long vfio_noiommu_ioctl(void *iommu_data,
181 unsigned int cmd, unsigned long arg)
183 if (cmd == VFIO_CHECK_EXTENSION)
184 return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
189 static int vfio_noiommu_attach_group(void *iommu_data,
190 struct iommu_group *iommu_group, enum vfio_group_type type)
195 static void vfio_noiommu_detach_group(void *iommu_data,
196 struct iommu_group *iommu_group)
200 static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
201 .name = "vfio-noiommu",
202 .owner = THIS_MODULE,
203 .open = vfio_noiommu_open,
204 .release = vfio_noiommu_release,
205 .ioctl = vfio_noiommu_ioctl,
206 .attach_group = vfio_noiommu_attach_group,
207 .detach_group = vfio_noiommu_detach_group,
211 * Only noiommu containers can use vfio-noiommu and noiommu containers can only
212 * use vfio-noiommu.
214 static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
215 const struct vfio_iommu_driver *driver)
217 return container->noiommu == (driver->ops == &vfio_noiommu_ops);
220 static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
221 const struct vfio_iommu_driver *driver)
225 #endif /* CONFIG_VFIO_NOIOMMU */
228 * IOMMU driver registration
230 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
232 struct vfio_iommu_driver *driver, *tmp;
234 driver = kzalloc(sizeof(*driver), GFP_KERNEL);
240 mutex_lock(&vfio.iommu_drivers_lock);
242 /* Check for duplicates */
243 list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
244 if (tmp->ops == ops) {
245 mutex_unlock(&vfio.iommu_drivers_lock);
251 list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
253 mutex_unlock(&vfio.iommu_drivers_lock);
257 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
259 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
261 struct vfio_iommu_driver *driver;
263 mutex_lock(&vfio.iommu_drivers_lock);
264 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
265 if (driver->ops == ops) {
266 list_del(&driver->vfio_next);
267 mutex_unlock(&vfio.iommu_drivers_lock);
272 mutex_unlock(&vfio.iommu_drivers_lock);
274 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
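/*
 * Example (editorial sketch, not part of the original source): an IOMMU
 * backend such as vfio_iommu_type1 publishes its ops table from
 * module_init() and withdraws it on exit. The my_* callbacks are
 * hypothetical and must match the vfio_iommu_driver_ops prototypes.
 *
 *	static const struct vfio_iommu_driver_ops my_backend_ops = {
 *		.name		= "my-backend",
 *		.owner		= THIS_MODULE,
 *		.open		= my_backend_open,
 *		.release	= my_backend_release,
 *		.ioctl		= my_backend_ioctl,
 *		.attach_group	= my_backend_attach_group,
 *		.detach_group	= my_backend_detach_group,
 *	};
 *
 *	static int __init my_backend_init(void)
 *	{
 *		return vfio_register_iommu_driver(&my_backend_ops);
 *	}
 *
 *	static void __exit my_backend_exit(void)
 *	{
 *		vfio_unregister_iommu_driver(&my_backend_ops);
 *	}
 */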
276 static void vfio_group_get(struct vfio_group *group);
279 * Container objects - containers are created when /dev/vfio/vfio is
280 * opened, but their lifecycle extends until the last user is done, so
281 * it's freed via kref. Must support container/group/device being
282 * closed in any order.
284 static void vfio_container_get(struct vfio_container *container)
286 kref_get(&container->kref);
289 static void vfio_container_release(struct kref *kref)
291 struct vfio_container *container;
292 container = container_of(kref, struct vfio_container, kref);
297 static void vfio_container_put(struct vfio_container *container)
299 kref_put(&container->kref, vfio_container_release);
303 * Group objects - create, release, get, put, search
305 static struct vfio_group *
306 __vfio_group_get_from_iommu(struct iommu_group *iommu_group)
308 struct vfio_group *group;
310 list_for_each_entry(group, &vfio.group_list, vfio_next) {
311 if (group->iommu_group == iommu_group) {
312 vfio_group_get(group);
319 static struct vfio_group *
320 vfio_group_get_from_iommu(struct iommu_group *iommu_group)
322 struct vfio_group *group;
324 mutex_lock(&vfio.group_lock);
325 group = __vfio_group_get_from_iommu(iommu_group);
326 mutex_unlock(&vfio.group_lock);
330 static void vfio_group_release(struct device *dev)
332 struct vfio_group *group = container_of(dev, struct vfio_group, dev);
334 mutex_destroy(&group->device_lock);
335 iommu_group_put(group->iommu_group);
336 ida_free(&vfio.group_ida, MINOR(group->dev.devt));
340 static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group,
341 enum vfio_group_type type)
343 struct vfio_group *group;
346 group = kzalloc(sizeof(*group), GFP_KERNEL);
348 return ERR_PTR(-ENOMEM);
350 minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL);
353 return ERR_PTR(minor);
356 device_initialize(&group->dev);
357 group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor);
358 group->dev.class = vfio.class;
359 group->dev.release = vfio_group_release;
360 cdev_init(&group->cdev, &vfio_group_fops);
361 group->cdev.owner = THIS_MODULE;
363 refcount_set(&group->users, 1);
364 init_rwsem(&group->group_rwsem);
365 INIT_LIST_HEAD(&group->device_list);
366 mutex_init(&group->device_lock);
367 group->iommu_group = iommu_group;
368 /* put in vfio_group_release() */
369 iommu_group_ref_get(iommu_group);
371 BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
376 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
377 enum vfio_group_type type)
379 struct vfio_group *group;
380 struct vfio_group *ret;
383 group = vfio_group_alloc(iommu_group, type);
387 err = dev_set_name(&group->dev, "%s%d",
388 group->type == VFIO_NO_IOMMU ? "noiommu-" : "",
389 iommu_group_id(iommu_group));
395 mutex_lock(&vfio.group_lock);
397 /* Did we race creating this group? */
398 ret = __vfio_group_get_from_iommu(iommu_group);
402 err = cdev_device_add(&group->cdev, &group->dev);
408 list_add(&group->vfio_next, &vfio.group_list);
410 mutex_unlock(&vfio.group_lock);
414 mutex_unlock(&vfio.group_lock);
416 put_device(&group->dev);
420 static void vfio_group_put(struct vfio_group *group)
422 if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock))
426 * These data structures all have paired operations that can only be
427 * undone when the caller holds a live reference on the group. Since all
428 * pairs must be undone these WARN_ON's indicate some caller did not
429 * properly hold the group reference.
431 WARN_ON(!list_empty(&group->device_list));
432 WARN_ON(group->container || group->container_users);
433 WARN_ON(group->notifier.head);
435 list_del(&group->vfio_next);
436 cdev_device_del(&group->cdev, &group->dev);
437 mutex_unlock(&vfio.group_lock);
439 put_device(&group->dev);
442 static void vfio_group_get(struct vfio_group *group)
444 refcount_inc(&group->users);
448 * Device objects - create, release, get, put, search
450 /* Device reference always implies a group reference */
451 static void vfio_device_put(struct vfio_device *device)
453 if (refcount_dec_and_test(&device->refcount))
454 complete(&device->comp);
457 static bool vfio_device_try_get(struct vfio_device *device)
459 return refcount_inc_not_zero(&device->refcount);
462 static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
465 struct vfio_device *device;
467 mutex_lock(&group->device_lock);
468 list_for_each_entry(device, &group->device_list, group_next) {
469 if (device->dev == dev && vfio_device_try_get(device)) {
470 mutex_unlock(&group->device_lock);
474 mutex_unlock(&group->device_lock);
481 void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
482 const struct vfio_device_ops *ops)
484 init_completion(&device->comp);
488 EXPORT_SYMBOL_GPL(vfio_init_group_dev);
490 void vfio_uninit_group_dev(struct vfio_device *device)
492 vfio_release_device_set(device);
494 EXPORT_SYMBOL_GPL(vfio_uninit_group_dev);
496 static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev,
497 enum vfio_group_type type)
499 struct iommu_group *iommu_group;
500 struct vfio_group *group;
503 iommu_group = iommu_group_alloc();
504 if (IS_ERR(iommu_group))
505 return ERR_CAST(iommu_group);
507 iommu_group_set_name(iommu_group, "vfio-noiommu");
508 ret = iommu_group_add_device(iommu_group, dev);
512 group = vfio_create_group(iommu_group, type);
514 ret = PTR_ERR(group);
515 goto out_remove_device;
517 iommu_group_put(iommu_group);
521 iommu_group_remove_device(dev);
523 iommu_group_put(iommu_group);
527 static struct vfio_group *vfio_group_find_or_alloc(struct device *dev)
529 struct iommu_group *iommu_group;
530 struct vfio_group *group;
532 iommu_group = iommu_group_get(dev);
533 #ifdef CONFIG_VFIO_NOIOMMU
534 if (!iommu_group && noiommu) {
536 * With noiommu enabled, create an IOMMU group for devices that
537 * don't already have one, implying no IOMMU hardware/driver
538 * exists. Taint the kernel because we're about to give a DMA
539 * capable device to a user without IOMMU protection.
541 group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU);
542 if (!IS_ERR(group)) {
543 add_taint(TAINT_USER, LOCKDEP_STILL_OK);
544 dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
550 return ERR_PTR(-EINVAL);
552 group = vfio_group_get_from_iommu(iommu_group);
554 group = vfio_create_group(iommu_group, VFIO_IOMMU);
556 /* The vfio_group holds a reference to the iommu_group */
557 iommu_group_put(iommu_group);
561 static int __vfio_register_dev(struct vfio_device *device,
562 struct vfio_group *group)
564 struct vfio_device *existing_device;
567 return PTR_ERR(group);
570 * If the driver doesn't specify a set then the device is added to a
571 * singleton set just for itself.
573 if (!device->dev_set)
574 vfio_assign_device_set(device, device);
576 existing_device = vfio_group_get_device(group, device->dev);
577 if (existing_device) {
578 dev_WARN(device->dev, "Device already exists on group %d\n",
579 iommu_group_id(group->iommu_group));
580 vfio_device_put(existing_device);
581 if (group->type == VFIO_NO_IOMMU ||
582 group->type == VFIO_EMULATED_IOMMU)
583 iommu_group_remove_device(device->dev);
584 vfio_group_put(group);
588 /* Our reference on group is moved to the device */
589 device->group = group;
591 /* Refcounting can't start until the driver calls register */
592 refcount_set(&device->refcount, 1);
594 mutex_lock(&group->device_lock);
595 list_add(&device->group_next, &group->device_list);
596 group->dev_counter++;
597 mutex_unlock(&group->device_lock);
602 int vfio_register_group_dev(struct vfio_device *device)
605 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
606 * restore cache coherency.
608 if (!iommu_capable(device->dev->bus, IOMMU_CAP_CACHE_COHERENCY))
611 return __vfio_register_dev(device,
612 vfio_group_find_or_alloc(device->dev));
614 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
617 * Register a virtual device without IOMMU backing. The user of this
618 * device must not be able to directly trigger unmediated DMA.
620 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
622 return __vfio_register_dev(device,
623 vfio_noiommu_group_alloc(device->dev, VFIO_EMULATED_IOMMU));
625 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
627 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
630 struct vfio_device *it, *device = ERR_PTR(-ENODEV);
632 mutex_lock(&group->device_lock);
633 list_for_each_entry(it, &group->device_list, group_next) {
636 if (it->ops->match) {
637 ret = it->ops->match(it, buf);
639 device = ERR_PTR(ret);
643 ret = !strcmp(dev_name(it->dev), buf);
646 if (ret && vfio_device_try_get(it)) {
651 mutex_unlock(&group->device_lock);
657 * Decrement the device reference count and wait for the device to be
658 * removed. Open file descriptors for the device... */
659 void vfio_unregister_group_dev(struct vfio_device *device)
661 struct vfio_group *group = device->group;
663 bool interrupted = false;
666 vfio_device_put(device);
667 rc = try_wait_for_completion(&device->comp);
669 if (device->ops->request)
670 device->ops->request(device, i++);
673 rc = wait_for_completion_timeout(&device->comp,
676 rc = wait_for_completion_interruptible_timeout(
677 &device->comp, HZ * 10);
680 dev_warn(device->dev,
681 "Device is currently in use, task"
683 "blocked until device is released",
684 current->comm, task_pid_nr(current));
689 mutex_lock(&group->device_lock);
690 list_del(&device->group_next);
691 group->dev_counter--;
692 mutex_unlock(&group->device_lock);
694 if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU)
695 iommu_group_remove_device(device->dev);
697 /* Matches the get in vfio_register_group_dev() */
698 vfio_group_put(group);
700 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
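/*
 * Example (editorial sketch, not part of the original source): the
 * registration lifecycle as seen from a vfio bus driver. The vfio_device
 * is embedded in the driver's own state, initialized, registered, and torn
 * down in reverse order on remove; unregistration blocks until all open
 * device FDs are gone, as described above. All my_* names are hypothetical.
 *
 *	struct my_device {
 *		struct vfio_device vdev;
 *		void __iomem *regs;
 *	};
 *
 *	static const struct vfio_device_ops my_vfio_ops = {
 *		.name		= "my-vfio-driver",
 *		.open_device	= my_open_device,
 *		.close_device	= my_close_device,
 *		.ioctl		= my_ioctl,
 *		.mmap		= my_mmap,
 *	};
 *
 *	static int my_probe(struct device *dev)
 *	{
 *		struct my_device *my = kzalloc(sizeof(*my), GFP_KERNEL);
 *		int ret;
 *
 *		if (!my)
 *			return -ENOMEM;
 *		vfio_init_group_dev(&my->vdev, dev, &my_vfio_ops);
 *		ret = vfio_register_group_dev(&my->vdev);
 *		if (ret) {
 *			vfio_uninit_group_dev(&my->vdev);
 *			kfree(my);
 *			return ret;
 *		}
 *		dev_set_drvdata(dev, my);
 *		return 0;
 *	}
 *
 *	static void my_remove(struct device *dev)
 *	{
 *		struct my_device *my = dev_get_drvdata(dev);
 *
 *		vfio_unregister_group_dev(&my->vdev);
 *		vfio_uninit_group_dev(&my->vdev);
 *		kfree(my);
 *	}
 */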
703 * VFIO base fd, /dev/vfio/vfio
705 static long vfio_ioctl_check_extension(struct vfio_container *container,
708 struct vfio_iommu_driver *driver;
711 down_read(&container->group_lock);
713 driver = container->iommu_driver;
716 /* No base extensions yet */
719 * If no driver is set, poll all registered drivers for
720 * extensions and return the first positive result. If
721 * a driver is already set, further queries will be passed
722 * only to that driver.
725 mutex_lock(&vfio.iommu_drivers_lock);
726 list_for_each_entry(driver, &vfio.iommu_drivers_list,
729 if (!list_empty(&container->group_list) &&
730 !vfio_iommu_driver_allowed(container,
733 if (!try_module_get(driver->ops->owner))
736 ret = driver->ops->ioctl(NULL,
737 VFIO_CHECK_EXTENSION,
739 module_put(driver->ops->owner);
743 mutex_unlock(&vfio.iommu_drivers_lock);
745 ret = driver->ops->ioctl(container->iommu_data,
746 VFIO_CHECK_EXTENSION, arg);
749 up_read(&container->group_lock);
754 /* hold write lock on container->group_lock */
755 static int __vfio_container_attach_groups(struct vfio_container *container,
756 struct vfio_iommu_driver *driver,
759 struct vfio_group *group;
762 list_for_each_entry(group, &container->group_list, container_next) {
763 ret = driver->ops->attach_group(data, group->iommu_group,
772 list_for_each_entry_continue_reverse(group, &container->group_list,
774 driver->ops->detach_group(data, group->iommu_group);
780 static long vfio_ioctl_set_iommu(struct vfio_container *container,
783 struct vfio_iommu_driver *driver;
786 down_write(&container->group_lock);
789 * The container is designed to be an unprivileged interface while
790 * the group can be assigned to specific users. Therefore, only by
791 * adding a group to a container does the user get the privilege of
792 * enabling the iommu, which may allocate finite resources. There
793 * is no unset_iommu, but by removing all the groups from a container,
794 * the container is deprivileged and returns to an unset state.
796 if (list_empty(&container->group_list) || container->iommu_driver) {
797 up_write(&container->group_lock);
801 mutex_lock(&vfio.iommu_drivers_lock);
802 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
805 if (!vfio_iommu_driver_allowed(container, driver))
807 if (!try_module_get(driver->ops->owner))
811 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
812 * so test which iommu driver reported support for this
813 * extension and call open on them. We also pass them the
814 * magic, allowing a single driver to support multiple
815 * interfaces if they'd like.
817 if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
818 module_put(driver->ops->owner);
822 data = driver->ops->open(arg);
825 module_put(driver->ops->owner);
829 ret = __vfio_container_attach_groups(container, driver, data);
831 driver->ops->release(data);
832 module_put(driver->ops->owner);
836 container->iommu_driver = driver;
837 container->iommu_data = data;
841 mutex_unlock(&vfio.iommu_drivers_lock);
842 up_write(&container->group_lock);
847 static long vfio_fops_unl_ioctl(struct file *filep,
848 unsigned int cmd, unsigned long arg)
850 struct vfio_container *container = filep->private_data;
851 struct vfio_iommu_driver *driver;
859 case VFIO_GET_API_VERSION:
860 ret = VFIO_API_VERSION;
862 case VFIO_CHECK_EXTENSION:
863 ret = vfio_ioctl_check_extension(container, arg);
866 ret = vfio_ioctl_set_iommu(container, arg);
869 driver = container->iommu_driver;
870 data = container->iommu_data;
872 if (driver) /* passthrough all unrecognized ioctls */
873 ret = driver->ops->ioctl(data, cmd, arg);
879 static int vfio_fops_open(struct inode *inode, struct file *filep)
881 struct vfio_container *container;
883 container = kzalloc(sizeof(*container), GFP_KERNEL);
887 INIT_LIST_HEAD(&container->group_list);
888 init_rwsem(&container->group_lock);
889 kref_init(&container->kref);
891 filep->private_data = container;
896 static int vfio_fops_release(struct inode *inode, struct file *filep)
898 struct vfio_container *container = filep->private_data;
899 struct vfio_iommu_driver *driver = container->iommu_driver;
901 if (driver && driver->ops->notify)
902 driver->ops->notify(container->iommu_data,
903 VFIO_IOMMU_CONTAINER_CLOSE);
905 filep->private_data = NULL;
907 vfio_container_put(container);
912 static const struct file_operations vfio_fops = {
913 .owner = THIS_MODULE,
914 .open = vfio_fops_open,
915 .release = vfio_fops_release,
916 .unlocked_ioctl = vfio_fops_unl_ioctl,
917 .compat_ioctl = compat_ptr_ioctl,
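/*
 * Example (editorial sketch, not part of the original source): the
 * userspace side of the container fops above, in the spirit of
 * Documentation/driver-api/vfio.rst; error handling is omitted.
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *
 *	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
 *		return -1;
 *	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
 *		return -1;
 *
 * VFIO_SET_IOMMU only succeeds after at least one group has been added to
 * the container (see vfio_ioctl_set_iommu() below).
 */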
921 * VFIO Group fd, /dev/vfio/$GROUP
923 static void __vfio_group_unset_container(struct vfio_group *group)
925 struct vfio_container *container = group->container;
926 struct vfio_iommu_driver *driver;
928 lockdep_assert_held_write(&group->group_rwsem);
930 down_write(&container->group_lock);
932 driver = container->iommu_driver;
934 driver->ops->detach_group(container->iommu_data,
937 if (group->type == VFIO_IOMMU)
938 iommu_group_release_dma_owner(group->iommu_group);
940 group->container = NULL;
941 group->container_users = 0;
942 list_del(&group->container_next);
944 /* Detaching the last group deprivileges a container, remove iommu */
945 if (driver && list_empty(&container->group_list)) {
946 driver->ops->release(container->iommu_data);
947 module_put(driver->ops->owner);
948 container->iommu_driver = NULL;
949 container->iommu_data = NULL;
952 up_write(&container->group_lock);
954 vfio_container_put(container);
958 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
959 * if there was no container to unset. Since the ioctl is called on
960 * the group, we know that still exists, therefore the only valid
961 * transition here is 1->0.
963 static int vfio_group_unset_container(struct vfio_group *group)
965 lockdep_assert_held_write(&group->group_rwsem);
967 if (!group->container)
969 if (group->container_users != 1)
971 __vfio_group_unset_container(group);
975 static int vfio_group_set_container(struct vfio_group *group, int container_fd)
978 struct vfio_container *container;
979 struct vfio_iommu_driver *driver;
982 lockdep_assert_held_write(&group->group_rwsem);
984 if (group->container || WARN_ON(group->container_users))
987 if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
990 f = fdget(container_fd);
994 /* Sanity check, is this really our fd? */
995 if (f.file->f_op != &vfio_fops) {
1000 container = f.file->private_data;
1001 WARN_ON(!container); /* fget ensures we don't race vfio_release */
1003 down_write(&container->group_lock);
1005 /* Real groups and fake groups cannot mix */
1006 if (!list_empty(&container->group_list) &&
1007 container->noiommu != (group->type == VFIO_NO_IOMMU)) {
1012 if (group->type == VFIO_IOMMU) {
1013 ret = iommu_group_claim_dma_owner(group->iommu_group, f.file);
1018 driver = container->iommu_driver;
1020 ret = driver->ops->attach_group(container->iommu_data,
1024 if (group->type == VFIO_IOMMU)
1025 iommu_group_release_dma_owner(
1026 group->iommu_group);
1031 group->container = container;
1032 group->container_users = 1;
1033 container->noiommu = (group->type == VFIO_NO_IOMMU);
1034 list_add(&group->container_next, &container->group_list);
1036 /* Get a reference on the container and mark a user within the group */
1037 vfio_container_get(container);
1040 up_write(&container->group_lock);
1045 static const struct file_operations vfio_device_fops;
1047 /* true if the vfio_device has open_device() called but not close_device() */
1048 static bool vfio_assert_device_open(struct vfio_device *device)
1050 return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
1053 static int vfio_device_assign_container(struct vfio_device *device)
1055 struct vfio_group *group = device->group;
1057 lockdep_assert_held_write(&group->group_rwsem);
1059 if (!group->container || !group->container->iommu_driver ||
1060 WARN_ON(!group->container_users))
1063 if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
1066 get_file(group->opened_file);
1067 group->container_users++;
1071 static void vfio_device_unassign_container(struct vfio_device *device)
1073 down_write(&device->group->group_rwsem);
1074 WARN_ON(device->group->container_users <= 1);
1075 device->group->container_users--;
1076 fput(device->group->opened_file);
1077 up_write(&device->group->group_rwsem);
1080 static struct file *vfio_device_open(struct vfio_device *device)
1085 down_write(&device->group->group_rwsem);
1086 ret = vfio_device_assign_container(device);
1087 up_write(&device->group->group_rwsem);
1089 return ERR_PTR(ret);
1091 if (!try_module_get(device->dev->driver->owner)) {
1093 goto err_unassign_container;
1096 mutex_lock(&device->dev_set->lock);
1097 device->open_count++;
1098 if (device->open_count == 1) {
1100 * Here we pass the KVM pointer with the group under the read
1101 * lock. If the device driver will use it, it must obtain a
1102 * reference and release it during close_device.
1104 down_read(&device->group->group_rwsem);
1105 device->kvm = device->group->kvm;
1107 if (device->ops->open_device) {
1108 ret = device->ops->open_device(device);
1110 goto err_undo_count;
1112 up_read(&device->group->group_rwsem);
1114 mutex_unlock(&device->dev_set->lock);
1117 * We can't use anon_inode_getfd() because we need to modify
1118 * the f_mode flags directly to allow more than just ioctls
1120 filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1122 if (IS_ERR(filep)) {
1123 ret = PTR_ERR(filep);
1124 goto err_close_device;
1128 * TODO: add an anon_inode interface to do this.
1129 * Appears to be missing by lack of need rather than
1130 * explicitly prevented. Now there's need.
1132 filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1134 if (device->group->type == VFIO_NO_IOMMU)
1135 dev_warn(device->dev, "vfio-noiommu device opened by user "
1136 "(%s:%d)\n", current->comm, task_pid_nr(current));
1138 * On success the ref of device is moved to the file and
1139 * put in vfio_device_fops_release()
1144 mutex_lock(&device->dev_set->lock);
1145 down_read(&device->group->group_rwsem);
1146 if (device->open_count == 1 && device->ops->close_device)
1147 device->ops->close_device(device);
1149 device->open_count--;
1150 if (device->open_count == 0 && device->kvm)
1152 up_read(&device->group->group_rwsem);
1153 mutex_unlock(&device->dev_set->lock);
1154 module_put(device->dev->driver->owner);
1155 err_unassign_container:
1156 vfio_device_unassign_container(device);
1157 return ERR_PTR(ret);
1160 static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1162 struct vfio_device *device;
1167 device = vfio_device_get_from_name(group, buf);
1169 return PTR_ERR(device);
1171 fdno = get_unused_fd_flags(O_CLOEXEC);
1174 goto err_put_device;
1177 filep = vfio_device_open(device);
1178 if (IS_ERR(filep)) {
1179 ret = PTR_ERR(filep);
1183 fd_install(fdno, filep);
1187 put_unused_fd(fdno);
1189 vfio_device_put(device);
1193 static long vfio_group_fops_unl_ioctl(struct file *filep,
1194 unsigned int cmd, unsigned long arg)
1196 struct vfio_group *group = filep->private_data;
1200 case VFIO_GROUP_GET_STATUS:
1202 struct vfio_group_status status;
1203 unsigned long minsz;
1205 minsz = offsetofend(struct vfio_group_status, flags);
1207 if (copy_from_user(&status, (void __user *)arg, minsz))
1210 if (status.argsz < minsz)
1215 down_read(&group->group_rwsem);
1216 if (group->container)
1217 status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET |
1218 VFIO_GROUP_FLAGS_VIABLE;
1219 else if (!iommu_group_dma_owner_claimed(group->iommu_group))
1220 status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1221 up_read(&group->group_rwsem);
1223 if (copy_to_user((void __user *)arg, &status, minsz))
1229 case VFIO_GROUP_SET_CONTAINER:
1233 if (get_user(fd, (int __user *)arg))
1239 down_write(&group->group_rwsem);
1240 ret = vfio_group_set_container(group, fd);
1241 up_write(&group->group_rwsem);
1244 case VFIO_GROUP_UNSET_CONTAINER:
1245 down_write(&group->group_rwsem);
1246 ret = vfio_group_unset_container(group);
1247 up_write(&group->group_rwsem);
1249 case VFIO_GROUP_GET_DEVICE_FD:
1253 buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1255 return PTR_ERR(buf);
1257 ret = vfio_group_get_device_fd(group, buf);
1266 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1268 struct vfio_group *group =
1269 container_of(inode->i_cdev, struct vfio_group, cdev);
1272 down_write(&group->group_rwsem);
1274 /* users can be zero if this races with vfio_group_put() */
1275 if (!refcount_inc_not_zero(&group->users)) {
1280 if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) {
1286 * Do we need multiple instances of the group open? Seems not.
1288 if (group->opened_file) {
1292 group->opened_file = filep;
1293 filep->private_data = group;
1295 up_write(&group->group_rwsem);
1298 vfio_group_put(group);
1300 up_write(&group->group_rwsem);
1304 static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1306 struct vfio_group *group = filep->private_data;
1308 filep->private_data = NULL;
1310 down_write(&group->group_rwsem);
1312 * Device FDs hold a group file reference, therefore the group release
1313 * is only called when there are no open devices.
1315 WARN_ON(group->notifier.head);
1316 if (group->container) {
1317 WARN_ON(group->container_users != 1);
1318 __vfio_group_unset_container(group);
1320 group->opened_file = NULL;
1321 up_write(&group->group_rwsem);
1323 vfio_group_put(group);
1328 static const struct file_operations vfio_group_fops = {
1329 .owner = THIS_MODULE,
1330 .unlocked_ioctl = vfio_group_fops_unl_ioctl,
1331 .compat_ioctl = compat_ptr_ioctl,
1332 .open = vfio_group_fops_open,
1333 .release = vfio_group_fops_release,
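/*
 * Example (editorial sketch, not part of the original source): the
 * userspace flow against the group fd ioctls above, following
 * Documentation/driver-api/vfio.rst; group 26 and the device name are
 * placeholders, and error handling is omitted.
 *
 *	int group = open("/dev/vfio/26", O_RDWR);
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
 *		return -1;
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *	int device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */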
1339 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1341 struct vfio_device *device = filep->private_data;
1343 mutex_lock(&device->dev_set->lock);
1344 vfio_assert_device_open(device);
1345 down_read(&device->group->group_rwsem);
1346 if (device->open_count == 1 && device->ops->close_device)
1347 device->ops->close_device(device);
1348 up_read(&device->group->group_rwsem);
1349 device->open_count--;
1350 if (device->open_count == 0)
1352 mutex_unlock(&device->dev_set->lock);
1354 module_put(device->dev->driver->owner);
1356 vfio_device_unassign_container(device);
1358 vfio_device_put(device);
1364 * vfio_mig_get_next_state - Compute the next step in the FSM
1365 * @cur_fsm - The current state the device is in
1366 * @new_fsm - The target state to reach
1367 * @next_fsm - Pointer to the next step to get to new_fsm
1369 * Return 0 upon success, otherwise -errno
1370 * Upon success the next step in the state progression between cur_fsm and
1371 * new_fsm will be set in next_fsm.
1373 * This breaks down requests for combination transitions into smaller steps and
1374 * returns the next step to get to new_fsm. The function may need to be called
1375 * multiple times before reaching new_fsm.
1378 int vfio_mig_get_next_state(struct vfio_device *device,
1379 enum vfio_device_mig_state cur_fsm,
1380 enum vfio_device_mig_state new_fsm,
1381 enum vfio_device_mig_state *next_fsm)
1383 enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 };
1385 * The coding in this table requires the driver to implement the
1386 * following FSM arcs:
1387 * RESUMING -> STOP
1388 * STOP -> RESUMING
1389 * STOP -> STOP_COPY
1390 * STOP_COPY -> STOP
1391 *
1392 * If P2P is supported then the driver must also implement these FSM
1393 * arcs:
1394 * RUNNING -> RUNNING_P2P
1395 * RUNNING_P2P -> RUNNING
1396 * RUNNING_P2P -> STOP
1397 * STOP -> RUNNING_P2P
1398 * Without P2P the driver must implement:
1399 * RUNNING -> STOP
1400 * STOP -> RUNNING
1401 *
1402 * The coding will step through multiple states for some combination
1403 * transitions; if all optional features are supported, this means the
1404 * following combination transitions are stepped through as:
1405 * RESUMING -> STOP -> RUNNING_P2P
1406 * RESUMING -> STOP -> RUNNING_P2P -> RUNNING
1407 * RESUMING -> STOP -> STOP_COPY
1408 * RUNNING -> RUNNING_P2P -> STOP
1409 * RUNNING -> RUNNING_P2P -> STOP -> RESUMING
1410 * RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
1411 * RUNNING_P2P -> STOP -> RESUMING
1412 * RUNNING_P2P -> STOP -> STOP_COPY
1413 * STOP -> RUNNING_P2P -> RUNNING
1414 * STOP_COPY -> STOP -> RESUMING
1415 * STOP_COPY -> STOP -> RUNNING_P2P
1416 * STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
1418 static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
1419 [VFIO_DEVICE_STATE_STOP] = {
1420 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1421 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
1422 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
1423 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
1424 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
1425 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1427 [VFIO_DEVICE_STATE_RUNNING] = {
1428 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
1429 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
1430 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
1431 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
1432 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
1433 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1435 [VFIO_DEVICE_STATE_STOP_COPY] = {
1436 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1437 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
1438 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
1439 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
1440 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
1441 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1443 [VFIO_DEVICE_STATE_RESUMING] = {
1444 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1445 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
1446 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
1447 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
1448 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
1449 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1451 [VFIO_DEVICE_STATE_RUNNING_P2P] = {
1452 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1453 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
1454 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
1455 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
1456 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
1457 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1459 [VFIO_DEVICE_STATE_ERROR] = {
1460 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
1461 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
1462 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
1463 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
1464 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
1465 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1469 static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
1470 [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
1471 [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
1472 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
1473 [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
1474 [VFIO_DEVICE_STATE_RUNNING_P2P] =
1475 VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
1476 [VFIO_DEVICE_STATE_ERROR] = ~0U,
1479 if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
1480 (state_flags_table[cur_fsm] & device->migration_flags) !=
1481 state_flags_table[cur_fsm]))
1484 if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
1485 (state_flags_table[new_fsm] & device->migration_flags) !=
1486 state_flags_table[new_fsm])
1490 * Arcs touching optional and unsupported states are skipped over. The
1491 * driver will instead see an arc from the original state to the next
1492 * logical state, as per the above comment.
1494 *next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
1495 while ((state_flags_table[*next_fsm] & device->migration_flags) !=
1496 state_flags_table[*next_fsm])
1497 *next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
1499 return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
1501 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
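/*
 * Example (editorial sketch, not part of the original source): a driver's
 * migration_set_state() op typically walks from its current state to the
 * requested state one arc at a time, asking this helper for each hop.
 * my->mig_state and my_step_one_arc() are hypothetical.
 *
 *	enum vfio_device_mig_state next;
 *	struct file *filp = NULL;
 *	int ret;
 *
 *	while (my->mig_state != new_state) {
 *		ret = vfio_mig_get_next_state(&my->vdev, my->mig_state,
 *					      new_state, &next);
 *		if (ret)
 *			return ERR_PTR(ret);
 *		filp = my_step_one_arc(my, next);
 *		if (IS_ERR(filp))
 *			return filp;
 *		my->mig_state = next;
 *	}
 *	return filp;
 */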
1504 * Convert the driver's struct file into a FD number and return it to userspace
1506 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
1507 struct vfio_device_feature_mig_state *mig)
1512 fd = get_unused_fd_flags(O_CLOEXEC);
1519 if (copy_to_user(arg, mig, sizeof(*mig))) {
1521 goto out_put_unused;
1523 fd_install(fd, filp);
1534 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
1535 u32 flags, void __user *arg,
1539 offsetofend(struct vfio_device_feature_mig_state, data_fd);
1540 struct vfio_device_feature_mig_state mig;
1541 struct file *filp = NULL;
1544 if (!device->ops->migration_set_state ||
1545 !device->ops->migration_get_state)
1548 ret = vfio_check_feature(flags, argsz,
1549 VFIO_DEVICE_FEATURE_SET |
1550 VFIO_DEVICE_FEATURE_GET,
1555 if (copy_from_user(&mig, arg, minsz))
1558 if (flags & VFIO_DEVICE_FEATURE_GET) {
1559 enum vfio_device_mig_state curr_state;
1561 ret = device->ops->migration_get_state(device, &curr_state);
1564 mig.device_state = curr_state;
1568 /* Handle the VFIO_DEVICE_FEATURE_SET */
1569 filp = device->ops->migration_set_state(device, mig.device_state);
1570 if (IS_ERR(filp) || !filp)
1573 return vfio_ioct_mig_return_fd(filp, arg, &mig);
1576 if (copy_to_user(arg, &mig, sizeof(mig)))
1579 return PTR_ERR(filp);
1583 static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
1584 u32 flags, void __user *arg,
1587 struct vfio_device_feature_migration mig = {
1588 .flags = device->migration_flags,
1592 if (!device->ops->migration_set_state ||
1593 !device->ops->migration_get_state)
1596 ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
1600 if (copy_to_user(arg, &mig, sizeof(mig)))
1605 static int vfio_ioctl_device_feature(struct vfio_device *device,
1606 struct vfio_device_feature __user *arg)
1608 size_t minsz = offsetofend(struct vfio_device_feature, flags);
1609 struct vfio_device_feature feature;
1611 if (copy_from_user(&feature, arg, minsz))
1614 if (feature.argsz < minsz)
1617 /* Check unknown flags */
1619 ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1620 VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1623 /* GET & SET are mutually exclusive except with PROBE */
1624 if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1625 (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1626 (feature.flags & VFIO_DEVICE_FEATURE_GET))
1629 switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1630 case VFIO_DEVICE_FEATURE_MIGRATION:
1631 return vfio_ioctl_device_feature_migration(
1632 device, feature.flags, arg->data,
1633 feature.argsz - minsz);
1634 case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1635 return vfio_ioctl_device_feature_mig_device_state(
1636 device, feature.flags, arg->data,
1637 feature.argsz - minsz);
1639 if (unlikely(!device->ops->device_feature))
1641 return device->ops->device_feature(device, feature.flags,
1643 feature.argsz - minsz);
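/*
 * Example (editorial sketch, not part of the original source): how
 * userspace drives the migration feature handled above. The header and the
 * feature-specific payload share one buffer; on a successful SET to
 * STOP_COPY the data_fd field carries the migration data stream. Error
 * handling is omitted and device_fd is a placeholder.
 *
 *	char buf[sizeof(struct vfio_device_feature) +
 *		 sizeof(struct vfio_device_feature_mig_state)] = {};
 *	struct vfio_device_feature *feature = (void *)buf;
 *	struct vfio_device_feature_mig_state *mig = (void *)feature->data;
 *
 *	feature->argsz = sizeof(buf);
 *	feature->flags = VFIO_DEVICE_FEATURE_SET |
 *			 VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
 *	mig->device_state = VFIO_DEVICE_STATE_STOP_COPY;
 *	ioctl(device_fd, VFIO_DEVICE_FEATURE, feature);
 */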
1647 static long vfio_device_fops_unl_ioctl(struct file *filep,
1648 unsigned int cmd, unsigned long arg)
1650 struct vfio_device *device = filep->private_data;
1653 case VFIO_DEVICE_FEATURE:
1654 return vfio_ioctl_device_feature(device, (void __user *)arg);
1656 if (unlikely(!device->ops->ioctl))
1658 return device->ops->ioctl(device, cmd, arg);
1662 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1663 size_t count, loff_t *ppos)
1665 struct vfio_device *device = filep->private_data;
1667 if (unlikely(!device->ops->read))
1670 return device->ops->read(device, buf, count, ppos);
1673 static ssize_t vfio_device_fops_write(struct file *filep,
1674 const char __user *buf,
1675 size_t count, loff_t *ppos)
1677 struct vfio_device *device = filep->private_data;
1679 if (unlikely(!device->ops->write))
1682 return device->ops->write(device, buf, count, ppos);
1685 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1687 struct vfio_device *device = filep->private_data;
1689 if (unlikely(!device->ops->mmap))
1692 return device->ops->mmap(device, vma);
1695 static const struct file_operations vfio_device_fops = {
1696 .owner = THIS_MODULE,
1697 .release = vfio_device_fops_release,
1698 .read = vfio_device_fops_read,
1699 .write = vfio_device_fops_write,
1700 .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1701 .compat_ioctl = compat_ptr_ioctl,
1702 .mmap = vfio_device_fops_mmap,
1706 * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file
1707 * @file: VFIO group file
1709 * The returned iommu_group is valid as long as a ref is held on the file.
1711 struct iommu_group *vfio_file_iommu_group(struct file *file)
1713 struct vfio_group *group = file->private_data;
1715 if (file->f_op != &vfio_group_fops)
1717 return group->iommu_group;
1719 EXPORT_SYMBOL_GPL(vfio_file_iommu_group);
1722 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
1723 * is always CPU cache coherent
1724 * @file: VFIO group file
1726 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
1727 * bit in DMA transactions. A return of false indicates that the user has
1728 * rights to access additional instructions such as wbinvd on x86.
1730 bool vfio_file_enforced_coherent(struct file *file)
1732 struct vfio_group *group = file->private_data;
1735 if (file->f_op != &vfio_group_fops)
1738 down_read(&group->group_rwsem);
1739 if (group->container) {
1740 ret = vfio_ioctl_check_extension(group->container,
1744 * Since the coherency state is determined only once a container
1745 * is attached the user must do so before they can prove they
1746 * have permission.
1750 up_read(&group->group_rwsem);
1753 EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
1756 * vfio_file_set_kvm - Link a kvm with VFIO drivers
1757 * @file: VFIO group file
1760 * When a VFIO device is first opened the KVM will be available in
1761 * device->kvm if one was associated with the group.
1763 void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
1765 struct vfio_group *group = file->private_data;
1767 if (file->f_op != &vfio_group_fops)
1770 down_write(&group->group_rwsem);
1771 group->kvm = kvm;
1772 up_write(&group->group_rwsem);
1774 EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
1777 * vfio_file_has_dev - True if the VFIO file is a handle for device
1778 * @file: VFIO file to check
1779 * @device: Device that must be part of the file
1781 * Returns true if the given file has permission to manipulate the given device.
1783 bool vfio_file_has_dev(struct file *file, struct vfio_device *device)
1785 struct vfio_group *group = file->private_data;
1787 if (file->f_op != &vfio_group_fops)
1790 return group == device->group;
1792 EXPORT_SYMBOL_GPL(vfio_file_has_dev);
1795 * Sub-module support
1798 * Helper for managing a buffer of info chain capabilities, allocate or
1799 * reallocate a buffer with additional @size, filling in @id and @version
1800 * of the capability. A pointer to the new capability is returned.
1802 * NB. The chain is based at the head of the buffer, so new entries are
1803 * added to the tail, vfio_info_cap_shift() should be called to fixup the
1804 * next offsets prior to copying to the user buffer.
1806 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1807 size_t size, u16 id, u16 version)
1810 struct vfio_info_cap_header *header, *tmp;
1812 buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1816 return ERR_PTR(-ENOMEM);
1820 header = buf + caps->size;
1822 /* Eventually copied to user buffer, zero */
1823 memset(header, 0, size);
1826 header->version = version;
1828 /* Add to the end of the capability chain */
1829 for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1832 tmp->next = caps->size;
1837 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1839 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1841 struct vfio_info_cap_header *tmp;
1842 void *buf = (void *)caps->buf;
1844 for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1845 tmp->next += offset;
1847 EXPORT_SYMBOL(vfio_info_cap_shift);
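/*
 * Example (editorial sketch, not part of the original source): the usual
 * pattern for appending a capability chain behind a fixed-size info struct,
 * here a struct vfio_region_info; MY_CAP_ID and body_size are hypothetical.
 * Real callers also verify that the user's argsz can hold the chain before
 * copying it out.
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *	struct vfio_info_cap_header *hdr;
 *
 *	hdr = vfio_info_cap_add(&caps, sizeof(*hdr) + body_size, MY_CAP_ID, 1);
 *	if (IS_ERR(hdr))
 *		return PTR_ERR(hdr);
 *
 *	if (caps.size) {
 *		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *		vfio_info_cap_shift(&caps, sizeof(info));
 *		if (copy_to_user((void __user *)arg + sizeof(info),
 *				 caps.buf, caps.size))
 *			ret = -EFAULT;
 *		info.cap_offset = sizeof(info);
 *	}
 *	kfree(caps.buf);
 */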
1849 int vfio_info_add_capability(struct vfio_info_cap *caps,
1850 struct vfio_info_cap_header *cap, size_t size)
1852 struct vfio_info_cap_header *header;
1854 header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1856 return PTR_ERR(header);
1858 memcpy(header + 1, cap + 1, size - sizeof(*header));
1862 EXPORT_SYMBOL(vfio_info_add_capability);
1864 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1865 int max_irq_type, size_t *data_size)
1867 unsigned long minsz;
1870 minsz = offsetofend(struct vfio_irq_set, count);
1872 if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1873 (hdr->count >= (U32_MAX - hdr->start)) ||
1874 (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1875 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1881 if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1884 switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1885 case VFIO_IRQ_SET_DATA_NONE:
1888 case VFIO_IRQ_SET_DATA_BOOL:
1889 size = sizeof(uint8_t);
1891 case VFIO_IRQ_SET_DATA_EVENTFD:
1892 size = sizeof(int32_t);
1899 if (hdr->argsz - minsz < hdr->count * size)
1905 *data_size = hdr->count * size;
1910 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
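/*
 * Example (editorial sketch, not part of the original source): a driver's
 * VFIO_DEVICE_SET_IRQS handler validates the header with this helper and
 * then copies in the variable-size payload; my_irq_count() and
 * MY_NUM_IRQ_TYPES are hypothetical.
 *
 *	struct vfio_irq_set hdr;
 *	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *	int ret;
 *
 *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *		return -EFAULT;
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr,
 *			my_irq_count(vdev, hdr.index),
 *			MY_NUM_IRQ_TYPES, &data_size);
 *	if (ret)
 *		return ret;
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */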
1913 * Pin a set of guest PFNs and return their associated host PFNs for local
1914 * domain only.
1915 * @device [in] : device
1916 * @user_pfn [in]: array of user/guest PFNs to be pinned.
1917 * @npage [in] : count of elements in user_pfn array. This count should not
1918 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1919 * @prot [in] : protection flags
1920 * @phys_pfn[out]: array of host PFNs
1921 * Return error or number of pages pinned.
1923 int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
1924 int npage, int prot, unsigned long *phys_pfn)
1926 struct vfio_container *container;
1927 struct vfio_group *group = device->group;
1928 struct vfio_iommu_driver *driver;
1931 if (!user_pfn || !phys_pfn || !npage ||
1932 !vfio_assert_device_open(device))
1935 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1938 if (group->dev_counter > 1)
1941 /* group->container cannot change while a vfio device is open */
1942 container = group->container;
1943 driver = container->iommu_driver;
1944 if (likely(driver && driver->ops->pin_pages))
1945 ret = driver->ops->pin_pages(container->iommu_data,
1946 group->iommu_group, user_pfn,
1947 npage, prot, phys_pfn);
1953 EXPORT_SYMBOL(vfio_pin_pages);
1956 * Unpin set of host PFNs for local domain only.
1957 * @device [in] : device
1958 * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
1959 * PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1960 * @npage [in] : count of elements in user_pfn array. This count should not
1961 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1962 * Return error or number of pages unpinned.
1964 int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
1967 struct vfio_container *container;
1968 struct vfio_iommu_driver *driver;
1971 if (!user_pfn || !npage || !vfio_assert_device_open(device))
1974 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1977 /* group->container cannot change while a vfio device is open */
1978 container = device->group->container;
1979 driver = container->iommu_driver;
1980 if (likely(driver && driver->ops->unpin_pages))
1981 ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
1988 EXPORT_SYMBOL(vfio_unpin_pages);
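/*
 * Example (editorial sketch, not part of the original source): an
 * emulated/mdev-style driver pinning the guest page it needs to touch and
 * dropping the pin afterwards; my->vdev and gpa are hypothetical, the prot
 * flags come from <linux/iommu.h>.
 *
 *	unsigned long gfn = gpa >> PAGE_SHIFT;
 *	unsigned long hpfn;
 *	int ret;
 *
 *	ret = vfio_pin_pages(&my->vdev, &gfn, 1,
 *			     IOMMU_READ | IOMMU_WRITE, &hpfn);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *
 *	...access the host page at PFN hpfn...
 *
 *	vfio_unpin_pages(&my->vdev, &gfn, 1);
 */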
1991 * This interface allows the CPUs to perform some sort of virtual DMA on
1992 * behalf of the device.
1994 * CPUs read/write from/into a range of IOVAs pointing to user space memory
1995 * into/from a kernel buffer.
1997 * As the read/write of user space memory is conducted via the CPUs and is
1998 * not a real device DMA, it is not necessary to pin the user space memory.
2000 * @device [in] : VFIO device
2001 * @user_iova [in] : base IOVA of a user space buffer
2002 * @data [in] : pointer to kernel buffer
2003 * @len [in] : kernel buffer length
2004 * @write : indicate read or write
2005 * Return error code on failure or 0 on success.
2007 int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data,
2008 size_t len, bool write)
2010 struct vfio_container *container;
2011 struct vfio_iommu_driver *driver;
2014 if (!data || len <= 0 || !vfio_assert_device_open(device))
2017 /* group->container cannot change while a vfio device is open */
2018 container = device->group->container;
2019 driver = container->iommu_driver;
2021 if (likely(driver && driver->ops->dma_rw))
2022 ret = driver->ops->dma_rw(container->iommu_data,
2023 user_iova, data, len, write);
2028 EXPORT_SYMBOL(vfio_dma_rw);
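/*
 * Example (editorial sketch, not part of the original source): reading a
 * guest-visible descriptor out of an IOVA range and writing a completion
 * status back, without pinning the pages. my->vdev, iova and struct
 * my_desc are hypothetical.
 *
 *	struct my_desc desc;
 *	int ret;
 *
 *	ret = vfio_dma_rw(&my->vdev, iova, &desc, sizeof(desc), false);
 *	if (ret)
 *		return ret;
 *	desc.status = MY_DESC_DONE;
 *	ret = vfio_dma_rw(&my->vdev, iova, &desc, sizeof(desc), true);
 */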
2030 static int vfio_register_iommu_notifier(struct vfio_group *group,
2031 unsigned long *events,
2032 struct notifier_block *nb)
2034 struct vfio_container *container;
2035 struct vfio_iommu_driver *driver;
2038 lockdep_assert_held_read(&group->group_rwsem);
2040 container = group->container;
2041 driver = container->iommu_driver;
2042 if (likely(driver && driver->ops->register_notifier))
2043 ret = driver->ops->register_notifier(container->iommu_data,
2051 static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2052 struct notifier_block *nb)
2054 struct vfio_container *container;
2055 struct vfio_iommu_driver *driver;
2058 lockdep_assert_held_read(&group->group_rwsem);
2060 container = group->container;
2061 driver = container->iommu_driver;
2062 if (likely(driver && driver->ops->unregister_notifier))
2063 ret = driver->ops->unregister_notifier(container->iommu_data,
2071 int vfio_register_notifier(struct vfio_device *device,
2072 enum vfio_notify_type type, unsigned long *events,
2073 struct notifier_block *nb)
2075 struct vfio_group *group = device->group;
2078 if (!nb || !events || (*events == 0) ||
2079 !vfio_assert_device_open(device))
2083 case VFIO_IOMMU_NOTIFY:
2084 ret = vfio_register_iommu_notifier(group, events, nb);
2091 EXPORT_SYMBOL(vfio_register_notifier);
2093 int vfio_unregister_notifier(struct vfio_device *device,
2094 enum vfio_notify_type type,
2095 struct notifier_block *nb)
2097 struct vfio_group *group = device->group;
2100 if (!nb || !vfio_assert_device_open(device))
2104 case VFIO_IOMMU_NOTIFY:
2105 ret = vfio_unregister_iommu_notifier(group, nb);
2112 EXPORT_SYMBOL(vfio_unregister_notifier);
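/*
 * Example (editorial sketch, not part of the original source): a driver
 * that pins pages registers for DMA unmap notifications so it can drop
 * pins covering the unmapped range; my->nb, my_notifier() and
 * my_unpin_range() are hypothetical.
 *
 *	static int my_notifier(struct notifier_block *nb, unsigned long action,
 *			       void *data)
 *	{
 *		if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
 *			struct vfio_iommu_type1_dma_unmap *unmap = data;
 *
 *			my_unpin_range(unmap->iova, unmap->size);
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
 *
 *	my->nb.notifier_call = my_notifier;
 *	ret = vfio_register_notifier(&my->vdev, VFIO_IOMMU_NOTIFY, &events,
 *				     &my->nb);
 *	...
 *	vfio_unregister_notifier(&my->vdev, VFIO_IOMMU_NOTIFY, &my->nb);
 */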
2115 * Module/class support
2117 static char *vfio_devnode(struct device *dev, umode_t *mode)
2119 return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2122 static struct miscdevice vfio_dev = {
2123 .minor = VFIO_MINOR,
2126 .nodename = "vfio/vfio",
2127 .mode = S_IRUGO | S_IWUGO,
2130 static int __init vfio_init(void)
2134 ida_init(&vfio.group_ida);
2135 mutex_init(&vfio.group_lock);
2136 mutex_init(&vfio.iommu_drivers_lock);
2137 INIT_LIST_HEAD(&vfio.group_list);
2138 INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2140 ret = misc_register(&vfio_dev);
2142 pr_err("vfio: misc device register failed\n");
2146 /* /dev/vfio/$GROUP */
2147 vfio.class = class_create(THIS_MODULE, "vfio");
2148 if (IS_ERR(vfio.class)) {
2149 ret = PTR_ERR(vfio.class);
2153 vfio.class->devnode = vfio_devnode;
2155 ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
2157 goto err_alloc_chrdev;
2159 pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2161 #ifdef CONFIG_VFIO_NOIOMMU
2162 vfio_register_iommu_driver(&vfio_noiommu_ops);
2167 class_destroy(vfio.class);
2170 misc_deregister(&vfio_dev);
2174 static void __exit vfio_cleanup(void)
2176 WARN_ON(!list_empty(&vfio.group_list));
2178 #ifdef CONFIG_VFIO_NOIOMMU
2179 vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2181 ida_destroy(&vfio.group_ida);
2182 unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2183 class_destroy(vfio.class);
2185 misc_deregister(&vfio_dev);
2186 xa_destroy(&vfio_device_set_xa);
2189 module_init(vfio_init);
2190 module_exit(vfio_cleanup);
2192 MODULE_VERSION(DRIVER_VERSION);
2193 MODULE_LICENSE("GPL v2");
2194 MODULE_AUTHOR(DRIVER_AUTHOR);
2195 MODULE_DESCRIPTION(DRIVER_DESC);
2196 MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2197 MODULE_ALIAS("devname:vfio/vfio");
2198 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");