drivers/vfio/vfio.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * VFIO core
   4  *
   5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   6  *     Author: Alex Williamson <alex.williamson@redhat.com>
   7  *
   8  * Derived from original vfio:
   9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  10  * Author: Tom Lyon, pugs@cisco.com
  11  */
  12
  13 #include <linux/cdev.h>
  14 #include <linux/compat.h>
  15 #include <linux/device.h>
  16 #include <linux/file.h>
  17 #include <linux/anon_inodes.h>
  18 #include <linux/fs.h>
  19 #include <linux/idr.h>
  20 #include <linux/iommu.h>
  21 #include <linux/list.h>
  22 #include <linux/miscdevice.h>
  23 #include <linux/module.h>
  24 #include <linux/mutex.h>
  25 #include <linux/pci.h>
  26 #include <linux/rwsem.h>
  27 #include <linux/sched.h>
  28 #include <linux/slab.h>
  29 #include <linux/stat.h>
  30 #include <linux/string.h>
  31 #include <linux/uaccess.h>
  32 #include <linux/vfio.h>
  33 #include <linux/wait.h>
  34 #include <linux/sched/signal.h>
  35
  36 #define DRIVER_VERSION  "0.3"
  37 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
  38 #define DRIVER_DESC     "VFIO - User Level meta-driver"
  39
/*
 * Module-wide VFIO state.  There is exactly one instance, the file-scope
 * variable "vfio".
 */
static struct vfio {
        /* Device class handed to device_create()/device_destroy() for groups */
        struct class                    *class;
        /* Registered struct vfio_iommu_driver entries, linked via vfio_next */
        struct list_head                iommu_drivers_list;
        /* Held while walking or modifying iommu_drivers_list */
        struct mutex                    iommu_drivers_lock;
        /* All live struct vfio_group objects, linked via vfio_next */
        struct list_head                group_list;
        /* Maps a group minor number to its struct vfio_group */
        struct idr                      group_idr;
        /* Held around group_list and group_idr lookups and updates */
        struct mutex                    group_lock;
        /* NOTE(review): not referenced in this part of the file */
        struct cdev                     group_cdev;
        /* Its MAJOR() is combined with a group minor to form a group's dev_t */
        dev_t                           group_devt;
} vfio;
  50
/*
 * One registered IOMMU backend.  Allocated by vfio_register_iommu_driver()
 * and freed by vfio_unregister_iommu_driver().
 */
struct vfio_iommu_driver {
        /* Caller-supplied ops table; also the key used to detect duplicates */
        const struct vfio_iommu_driver_ops      *ops;
        /* Link in vfio.iommu_drivers_list */
        struct list_head                        vfio_next;
};
  55
/*
 * Container object.  It is reference counted through kref and freed by
 * vfio_container_release() when the last reference is dropped.
 */
struct vfio_container {
        struct kref                     kref;
        /* NOTE(review): presumably the attached groups, linked through
         * vfio_group.container_next - confirm against the attach path */
        struct list_head                group_list;
        /* NOTE(review): presumably guards group_list and iommu_driver - confirm */
        struct rw_semaphore             group_lock;
        struct vfio_iommu_driver        *iommu_driver;
        void                            *iommu_data;
        bool                            noiommu;
};
  64
/*
 * Entry on a group's unbound_list.  vfio_dev_viable() treats a device found
 * on that list as viable; the entry is removed when the iommu group notifier
 * reports IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER for the device.
 */
struct vfio_unbound_dev {
        /* Device this entry stands for; compared by pointer */
        struct device                   *dev;
        /* Link in vfio_group.unbound_list */
        struct list_head                unbound_next;
};
  69
/*
 * VFIO view of one iommu_group.  Reference counted through kref; the final
 * put runs vfio_group_release() with vfio.group_lock held.
 */
struct vfio_group {
        struct kref                     kref;
        /* Minor allocated from vfio.group_idr for the group's device node */
        int                             minor;
        /* Zero means the group is idle; the notifier helpers check this */
        atomic_t                        container_users;
        /* Backing iommu group; its reference is dropped in vfio_group_release() */
        struct iommu_group              *iommu_group;
        /* NOTE(review): not used in this part of the file */
        struct vfio_container           *container;
        /* vfio_device objects of this group, linked via group_next */
        struct list_head                device_list;
        /* Protects device_list and dev_counter */
        struct mutex                    device_lock;
        /* Device returned by device_create() for this group */
        struct device                   *dev;
        /* Notifier block registered on iommu_group */
        struct notifier_block           nb;
        /* Link in vfio.group_list */
        struct list_head                vfio_next;
        /* NOTE(review): presumably the link in vfio_container.group_list */
        struct list_head                container_next;
        /* struct vfio_unbound_dev entries, linked via unbound_next */
        struct list_head                unbound_list;
        /* Protects unbound_list */
        struct mutex                    unbound_lock;
        atomic_t                        opened;
        wait_queue_head_t               container_q;
        /* Set when the iommu group's iommudata marks it as a noiommu group */
        bool                            noiommu;
        /* Incremented for every device added in vfio_register_group_dev() */
        unsigned int                    dev_counter;
        struct kvm                      *kvm;
        /* Expected to have no entries left when the group is released */
        struct blocking_notifier_head   notifier;
};
  91
#ifdef CONFIG_VFIO_NOIOMMU
/*
 * Backing variable of the enable_unsafe_noiommu_mode module parameter.
 * Besides its boolean value, the address of this variable is stored as the
 * iommudata of groups allocated by vfio_iommu_group_get(), which is how the
 * rest of the file recognises a noiommu group.
 */
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
                   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif
  98
/*
 * Maps a set_id (cast to unsigned long) to its struct vfio_device_set.  The
 * xarray lock also serialises updates of vfio_device_set.device_count.
 */
static DEFINE_XARRAY(vfio_device_set_xa);
 100
 101 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
 102 {
 103         unsigned long idx = (unsigned long)set_id;
 104         struct vfio_device_set *new_dev_set;
 105         struct vfio_device_set *dev_set;
 106
 107         if (WARN_ON(!set_id))
 108                 return -EINVAL;
 109
 110         /*
 111          * Atomically acquire a singleton object in the xarray for this set_id
 112          */
 113         xa_lock(&vfio_device_set_xa);
 114         dev_set = xa_load(&vfio_device_set_xa, idx);
 115         if (dev_set)
 116                 goto found_get_ref;
 117         xa_unlock(&vfio_device_set_xa);
 118
 119         new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
 120         if (!new_dev_set)
 121                 return -ENOMEM;
 122         mutex_init(&new_dev_set->lock);
 123         INIT_LIST_HEAD(&new_dev_set->device_list);
 124         new_dev_set->set_id = set_id;
 125
 126         xa_lock(&vfio_device_set_xa);
 127         dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
 128                                GFP_KERNEL);
 129         if (!dev_set) {
 130                 dev_set = new_dev_set;
 131                 goto found_get_ref;
 132         }
 133
 134         kfree(new_dev_set);
 135         if (xa_is_err(dev_set)) {
 136                 xa_unlock(&vfio_device_set_xa);
 137                 return xa_err(dev_set);
 138         }
 139
 140 found_get_ref:
 141         dev_set->device_count++;
 142         xa_unlock(&vfio_device_set_xa);
 143         mutex_lock(&dev_set->lock);
 144         device->dev_set = dev_set;
 145         list_add_tail(&device->dev_set_list, &dev_set->device_list);
 146         mutex_unlock(&dev_set->lock);
 147         return 0;
 148 }
 149 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
 150
 151 static void vfio_release_device_set(struct vfio_device *device)
 152 {
 153         struct vfio_device_set *dev_set = device->dev_set;
 154
 155         if (!dev_set)
 156                 return;
 157
 158         mutex_lock(&dev_set->lock);
 159         list_del(&device->dev_set_list);
 160         mutex_unlock(&dev_set->lock);
 161
 162         xa_lock(&vfio_device_set_xa);
 163         if (!--dev_set->device_count) {
 164                 __xa_erase(&vfio_device_set_xa,
 165                            (unsigned long)dev_set->set_id);
 166                 mutex_destroy(&dev_set->lock);
 167                 kfree(dev_set);
 168         }
 169         xa_unlock(&vfio_device_set_xa);
 170 }
 171
 172 /*
 173  * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 174  * and remove functions, any use cases other than acquiring the first
 175  * reference for the purpose of calling vfio_register_group_dev() or removing
 176  * that symmetric reference after vfio_unregister_group_dev() should use the raw
 177  * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 178  * removes the device from the dummy group and cannot be nested.
 179  */
 180 struct iommu_group *vfio_iommu_group_get(struct device *dev)
 181 {
 182         struct iommu_group *group;
 183         int __maybe_unused ret;
 184
 185         group = iommu_group_get(dev);
 186
 187 #ifdef CONFIG_VFIO_NOIOMMU
 188         /*
 189          * With noiommu enabled, an IOMMU group will be created for a device
 190          * that doesn't already have one and doesn't have an iommu_ops on their
 191          * bus.  We set iommudata simply to be able to identify these groups
 192          * as special use and for reclamation later.
 193          */
 194         if (group || !noiommu || iommu_present(dev->bus))
 195                 return group;
 196
 197         group = iommu_group_alloc();
 198         if (IS_ERR(group))
 199                 return NULL;
 200
 201         iommu_group_set_name(group, "vfio-noiommu");
 202         iommu_group_set_iommudata(group, &noiommu, NULL);
 203         ret = iommu_group_add_device(group, dev);
 204         if (ret) {
 205                 iommu_group_put(group);
 206                 return NULL;
 207         }
 208
 209         /*
 210          * Where to taint?  At this point we've added an IOMMU group for a
 211          * device that is not backed by iommu_ops, therefore any iommu_
 212          * callback using iommu_ops can legitimately Oops.  So, while we may
 213          * be about to give a DMA capable device to a user without IOMMU
 214          * protection, which is clearly taint-worthy, let's go ahead and do
 215          * it here.
 216          */
 217         add_taint(TAINT_USER, LOCKDEP_STILL_OK);
 218         dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
 219 #endif
 220
 221         return group;
 222 }
 223 EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
 224
 225 void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
 226 {
 227 #ifdef CONFIG_VFIO_NOIOMMU
 228         if (iommu_group_get_iommudata(group) == &noiommu)
 229                 iommu_group_remove_device(dev);
 230 #endif
 231
 232         iommu_group_put(group);
 233 }
 234 EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
 235
 236 #ifdef CONFIG_VFIO_NOIOMMU
 237 static void *vfio_noiommu_open(unsigned long arg)
 238 {
 239         if (arg != VFIO_NOIOMMU_IOMMU)
 240                 return ERR_PTR(-EINVAL);
 241         if (!capable(CAP_SYS_RAWIO))
 242                 return ERR_PTR(-EPERM);
 243
 244         return NULL;
 245 }
 246
/* .release callback of the noiommu backend; intentionally does nothing. */
static void vfio_noiommu_release(void *iommu_data)
{
}
 250
 251 static long vfio_noiommu_ioctl(void *iommu_data,
 252                                unsigned int cmd, unsigned long arg)
 253 {
 254         if (cmd == VFIO_CHECK_EXTENSION)
 255                 return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
 256
 257         return -ENOTTY;
 258 }
 259
 260 static int vfio_noiommu_attach_group(void *iommu_data,
 261                                      struct iommu_group *iommu_group)
 262 {
 263         return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
 264 }
 265
/* .detach_group callback of the noiommu backend; intentionally does nothing. */
static void vfio_noiommu_detach_group(void *iommu_data,
                                      struct iommu_group *iommu_group)
{
}
 270
/* Callback table of the "vfio-noiommu" backend defined above. */
static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
        .name = "vfio-noiommu",
        .owner = THIS_MODULE,
        .open = vfio_noiommu_open,
        .release = vfio_noiommu_release,
        .ioctl = vfio_noiommu_ioctl,
        .attach_group = vfio_noiommu_attach_group,
        .detach_group = vfio_noiommu_detach_group,
};
 280 #endif
 281
 282
 283 /**
 284  * IOMMU driver registration
 285  */
 286 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 287 {
 288         struct vfio_iommu_driver *driver, *tmp;
 289
 290         driver = kzalloc(sizeof(*driver), GFP_KERNEL);
 291         if (!driver)
 292                 return -ENOMEM;
 293
 294         driver->ops = ops;
 295
 296         mutex_lock(&vfio.iommu_drivers_lock);
 297
 298         /* Check for duplicates */
 299         list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
 300                 if (tmp->ops == ops) {
 301                         mutex_unlock(&vfio.iommu_drivers_lock);
 302                         kfree(driver);
 303                         return -EINVAL;
 304                 }
 305         }
 306
 307         list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
 308
 309         mutex_unlock(&vfio.iommu_drivers_lock);
 310
 311         return 0;
 312 }
 313 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
 314
 315 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 316 {
 317         struct vfio_iommu_driver *driver;
 318
 319         mutex_lock(&vfio.iommu_drivers_lock);
 320         list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
 321                 if (driver->ops == ops) {
 322                         list_del(&driver->vfio_next);
 323                         mutex_unlock(&vfio.iommu_drivers_lock);
 324                         kfree(driver);
 325                         return;
 326                 }
 327         }
 328         mutex_unlock(&vfio.iommu_drivers_lock);
 329 }
 330 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
 331
 332 /**
 333  * Group minor allocation/free - both called with vfio.group_lock held
 334  */
 335 static int vfio_alloc_group_minor(struct vfio_group *group)
 336 {
 337         return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
 338 }
 339
/* Release @minor from vfio.group_idr so that it can be allocated again. */
static void vfio_free_group_minor(int minor)
{
        idr_remove(&vfio.group_idr, minor);
}
 344
/* Forward declarations: both are used by vfio_create_group() before their definitions. */
static int vfio_iommu_group_notifier(struct notifier_block *nb,
                                     unsigned long action, void *data);
static void vfio_group_get(struct vfio_group *group);
 348
/*
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * they are freed via kref.  Must support container/group/device being
 * closed in any order.
 */

/* Take an additional reference on @container. */
static void vfio_container_get(struct vfio_container *container)
{
        kref_get(&container->kref);
}
 359
 360 static void vfio_container_release(struct kref *kref)
 361 {
 362         struct vfio_container *container;
 363         container = container_of(kref, struct vfio_container, kref);
 364
 365         kfree(container);
 366 }
 367
/* Drop a reference on @container; the last put frees it. */
static void vfio_container_put(struct vfio_container *container)
{
        kref_put(&container->kref, vfio_container_release);
}
 372
 373 static void vfio_group_unlock_and_free(struct vfio_group *group)
 374 {
 375         mutex_unlock(&vfio.group_lock);
 376         /*
 377          * Unregister outside of lock.  A spurious callback is harmless now
 378          * that the group is no longer in vfio.group_list.
 379          */
 380         iommu_group_unregister_notifier(group->iommu_group, &group->nb);
 381         kfree(group);
 382 }
 383
 384 /**
 385  * Group objects - create, release, get, put, search
 386  */
 387 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
 388 {
 389         struct vfio_group *group, *tmp;
 390         struct device *dev;
 391         int ret, minor;
 392
 393         group = kzalloc(sizeof(*group), GFP_KERNEL);
 394         if (!group)
 395                 return ERR_PTR(-ENOMEM);
 396
 397         kref_init(&group->kref);
 398         INIT_LIST_HEAD(&group->device_list);
 399         mutex_init(&group->device_lock);
 400         INIT_LIST_HEAD(&group->unbound_list);
 401         mutex_init(&group->unbound_lock);
 402         atomic_set(&group->container_users, 0);
 403         atomic_set(&group->opened, 0);
 404         init_waitqueue_head(&group->container_q);
 405         group->iommu_group = iommu_group;
 406 #ifdef CONFIG_VFIO_NOIOMMU
 407         group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
 408 #endif
 409         BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
 410
 411         group->nb.notifier_call = vfio_iommu_group_notifier;
 412
 413         /*
 414          * blocking notifiers acquire a rwsem around registering and hold
 415          * it around callback.  Therefore, need to register outside of
 416          * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
 417          * do anything unless it can find the group in vfio.group_list, so
 418          * no harm in registering early.
 419          */
 420         ret = iommu_group_register_notifier(iommu_group, &group->nb);
 421         if (ret) {
 422                 kfree(group);
 423                 return ERR_PTR(ret);
 424         }
 425
 426         mutex_lock(&vfio.group_lock);
 427
 428         /* Did we race creating this group? */
 429         list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
 430                 if (tmp->iommu_group == iommu_group) {
 431                         vfio_group_get(tmp);
 432                         vfio_group_unlock_and_free(group);
 433                         return tmp;
 434                 }
 435         }
 436
 437         minor = vfio_alloc_group_minor(group);
 438         if (minor < 0) {
 439                 vfio_group_unlock_and_free(group);
 440                 return ERR_PTR(minor);
 441         }
 442
 443         dev = device_create(vfio.class, NULL,
 444                             MKDEV(MAJOR(vfio.group_devt), minor),
 445                             group, "%s%d", group->noiommu ? "noiommu-" : "",
 446                             iommu_group_id(iommu_group));
 447         if (IS_ERR(dev)) {
 448                 vfio_free_group_minor(minor);
 449                 vfio_group_unlock_and_free(group);
 450                 return ERR_CAST(dev);
 451         }
 452
 453         group->minor = minor;
 454         group->dev = dev;
 455
 456         list_add(&group->vfio_next, &vfio.group_list);
 457
 458         mutex_unlock(&vfio.group_lock);
 459
 460         return group;
 461 }
 462
 463 /* called with vfio.group_lock held */
 464 static void vfio_group_release(struct kref *kref)
 465 {
 466         struct vfio_group *group = container_of(kref, struct vfio_group, kref);
 467         struct vfio_unbound_dev *unbound, *tmp;
 468         struct iommu_group *iommu_group = group->iommu_group;
 469
 470         WARN_ON(!list_empty(&group->device_list));
 471         WARN_ON(group->notifier.head);
 472
 473         list_for_each_entry_safe(unbound, tmp,
 474                                  &group->unbound_list, unbound_next) {
 475                 list_del(&unbound->unbound_next);
 476                 kfree(unbound);
 477         }
 478
 479         device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
 480         list_del(&group->vfio_next);
 481         vfio_free_group_minor(group->minor);
 482         vfio_group_unlock_and_free(group);
 483         iommu_group_put(iommu_group);
 484 }
 485
/*
 * Drop a reference on @group.  When it is the last one, vfio.group_lock is
 * taken and vfio_group_release() runs with it held.
 */
static void vfio_group_put(struct vfio_group *group)
{
        kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
}
 490
/* Work item used by vfio_group_schedule_put() to drop a group reference later. */
struct vfio_group_put_work {
        struct work_struct work;
        /* Group whose reference vfio_group_put_bg() will drop */
        struct vfio_group *group;
};
 495
 496 static void vfio_group_put_bg(struct work_struct *work)
 497 {
 498         struct vfio_group_put_work *do_work;
 499
 500         do_work = container_of(work, struct vfio_group_put_work, work);
 501
 502         vfio_group_put(do_work->group);
 503         kfree(do_work);
 504 }
 505
 506 static void vfio_group_schedule_put(struct vfio_group *group)
 507 {
 508         struct vfio_group_put_work *do_work;
 509
 510         do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
 511         if (WARN_ON(!do_work))
 512                 return;
 513
 514         INIT_WORK(&do_work->work, vfio_group_put_bg);
 515         do_work->group = group;
 516         schedule_work(&do_work->work);
 517 }
 518
/*
 * Take an additional reference on @group.  The caller must hold
 * vfio.group_lock or already own a reference on the group.
 */
static void vfio_group_get(struct vfio_group *group)
{
        kref_get(&group->kref);
}
 524
 525 /*
 526  * Not really a try as we will sleep for mutex, but we need to make
 527  * sure the group pointer is valid under lock and get a reference.
 528  */
 529 static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
 530 {
 531         struct vfio_group *target = group;
 532
 533         mutex_lock(&vfio.group_lock);
 534         list_for_each_entry(group, &vfio.group_list, vfio_next) {
 535                 if (group == target) {
 536                         vfio_group_get(group);
 537                         mutex_unlock(&vfio.group_lock);
 538                         return group;
 539                 }
 540         }
 541         mutex_unlock(&vfio.group_lock);
 542
 543         return NULL;
 544 }
 545
 546 static
 547 struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
 548 {
 549         struct vfio_group *group;
 550
 551         mutex_lock(&vfio.group_lock);
 552         list_for_each_entry(group, &vfio.group_list, vfio_next) {
 553                 if (group->iommu_group == iommu_group) {
 554                         vfio_group_get(group);
 555                         mutex_unlock(&vfio.group_lock);
 556                         return group;
 557                 }
 558         }
 559         mutex_unlock(&vfio.group_lock);
 560
 561         return NULL;
 562 }
 563
 564 static struct vfio_group *vfio_group_get_from_minor(int minor)
 565 {
 566         struct vfio_group *group;
 567
 568         mutex_lock(&vfio.group_lock);
 569         group = idr_find(&vfio.group_idr, minor);
 570         if (!group) {
 571                 mutex_unlock(&vfio.group_lock);
 572                 return NULL;
 573         }
 574         vfio_group_get(group);
 575         mutex_unlock(&vfio.group_lock);
 576
 577         return group;
 578 }
 579
 580 static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
 581 {
 582         struct iommu_group *iommu_group;
 583         struct vfio_group *group;
 584
 585         iommu_group = iommu_group_get(dev);
 586         if (!iommu_group)
 587                 return NULL;
 588
 589         group = vfio_group_get_from_iommu(iommu_group);
 590         iommu_group_put(iommu_group);
 591
 592         return group;
 593 }
 594
 595 /**
 596  * Device objects - create, release, get, put, search
 597  */
 598 /* Device reference always implies a group reference */
 599 void vfio_device_put(struct vfio_device *device)
 600 {
 601         if (refcount_dec_and_test(&device->refcount))
 602                 complete(&device->comp);
 603 }
 604 EXPORT_SYMBOL_GPL(vfio_device_put);
 605
/*
 * Take a reference on @device unless its refcount has already dropped to
 * zero.  Returns true if the reference was taken.
 */
static bool vfio_device_try_get(struct vfio_device *device)
{
        return refcount_inc_not_zero(&device->refcount);
}
 610
 611 static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
 612                                                  struct device *dev)
 613 {
 614         struct vfio_device *device;
 615
 616         mutex_lock(&group->device_lock);
 617         list_for_each_entry(device, &group->device_list, group_next) {
 618                 if (device->dev == dev && vfio_device_try_get(device)) {
 619                         mutex_unlock(&group->device_lock);
 620                         return device;
 621                 }
 622         }
 623         mutex_unlock(&group->device_lock);
 624         return NULL;
 625 }
 626
/*
 * Some drivers, like pci-stub, are only used to prevent other drivers from
 * claiming a device and are therefore perfectly legitimate for a user owned
 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 * of the device, but it does prevent the user from having direct access to
 * the device, which is useful in some circumstances.
 *
 * We also assume that we can include PCI interconnect devices, i.e. bridges.
 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 * then all of the downstream devices will be part of the same IOMMU group as
 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 * breaks anything, it only does so for user owned devices downstream.  Note
 * that error notification via MSI can be affected for platforms that handle
 * MSI within the same IOVA space as DMA.
 *
 * The table below lists the allowed driver names; it is matched against
 * device_driver.name in vfio_dev_driver_allowed().
 */
static const char * const vfio_driver_allowed[] = { "pci-stub" };
 643
 644 static bool vfio_dev_driver_allowed(struct device *dev,
 645                                     struct device_driver *drv)
 646 {
 647         if (dev_is_pci(dev)) {
 648                 struct pci_dev *pdev = to_pci_dev(dev);
 649
 650                 if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
 651                         return true;
 652         }
 653
 654         return match_string(vfio_driver_allowed,
 655                             ARRAY_SIZE(vfio_driver_allowed),
 656                             drv->name) >= 0;
 657 }
 658
 659 /*
 660  * A vfio group is viable for use by userspace if all devices are in
 661  * one of the following states:
 662  *  - driver-less
 663  *  - bound to a vfio driver
 664  *  - bound to an otherwise allowed driver
 665  *  - a PCI interconnect device
 666  *
 667  * We use two methods to determine whether a device is bound to a vfio
 668  * driver.  The first is to test whether the device exists in the vfio
 669  * group.  The second is to test if the device exists on the group
 670  * unbound_list, indicating it's in the middle of transitioning from
 671  * a vfio driver to driver-less.
 672  */
 673 static int vfio_dev_viable(struct device *dev, void *data)
 674 {
 675         struct vfio_group *group = data;
 676         struct vfio_device *device;
 677         struct device_driver *drv = READ_ONCE(dev->driver);
 678         struct vfio_unbound_dev *unbound;
 679         int ret = -EINVAL;
 680
 681         mutex_lock(&group->unbound_lock);
 682         list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
 683                 if (dev == unbound->dev) {
 684                         ret = 0;
 685                         break;
 686                 }
 687         }
 688         mutex_unlock(&group->unbound_lock);
 689
 690         if (!ret || !drv || vfio_dev_driver_allowed(dev, drv))
 691                 return 0;
 692
 693         device = vfio_group_get_device(group, dev);
 694         if (device) {
 695                 vfio_device_put(device);
 696                 return 0;
 697         }
 698
 699         return ret;
 700 }
 701
 702 /**
 703  * Async device support
 704  */
 705 static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
 706 {
 707         struct vfio_device *device;
 708
 709         /* Do we already know about it?  We shouldn't */
 710         device = vfio_group_get_device(group, dev);
 711         if (WARN_ON_ONCE(device)) {
 712                 vfio_device_put(device);
 713                 return 0;
 714         }
 715
 716         /* Nothing to do for idle groups */
 717         if (!atomic_read(&group->container_users))
 718                 return 0;
 719
 720         /* TODO Prevent device auto probing */
 721         dev_WARN(dev, "Device added to live group %d!\n",
 722                  iommu_group_id(group->iommu_group));
 723
 724         return 0;
 725 }
 726
 727 static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
 728 {
 729         /* We don't care what happens when the group isn't in use */
 730         if (!atomic_read(&group->container_users))
 731                 return 0;
 732
 733         return vfio_dev_viable(dev, group);
 734 }
 735
 736 static int vfio_iommu_group_notifier(struct notifier_block *nb,
 737                                      unsigned long action, void *data)
 738 {
 739         struct vfio_group *group = container_of(nb, struct vfio_group, nb);
 740         struct device *dev = data;
 741         struct vfio_unbound_dev *unbound;
 742
 743         /*
 744          * Need to go through a group_lock lookup to get a reference or we
 745          * risk racing a group being removed.  Ignore spurious notifies.
 746          */
 747         group = vfio_group_try_get(group);
 748         if (!group)
 749                 return NOTIFY_OK;
 750
 751         switch (action) {
 752         case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
 753                 vfio_group_nb_add_dev(group, dev);
 754                 break;
 755         case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
 756                 /*
 757                  * Nothing to do here.  If the device is in use, then the
 758                  * vfio sub-driver should block the remove callback until
 759                  * it is unused.  If the device is unused or attached to a
 760                  * stub driver, then it should be released and we don't
 761                  * care that it will be going away.
 762                  */
 763                 break;
 764         case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
 765                 dev_dbg(dev, "%s: group %d binding to driver\n", __func__,
 766                         iommu_group_id(group->iommu_group));
 767                 break;
 768         case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
 769                 dev_dbg(dev, "%s: group %d bound to driver %s\n", __func__,
 770                         iommu_group_id(group->iommu_group), dev->driver->name);
 771                 BUG_ON(vfio_group_nb_verify(group, dev));
 772                 break;
 773         case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
 774                 dev_dbg(dev, "%s: group %d unbinding from driver %s\n",
 775                         __func__, iommu_group_id(group->iommu_group),
 776                         dev->driver->name);
 777                 break;
 778         case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
 779                 dev_dbg(dev, "%s: group %d unbound from driver\n", __func__,
 780                         iommu_group_id(group->iommu_group));
 781                 /*
 782                  * XXX An unbound device in a live group is ok, but we'd
 783                  * really like to avoid the above BUG_ON by preventing other
 784                  * drivers from binding to it.  Once that occurs, we have to
 785                  * stop the system to maintain isolation.  At a minimum, we'd
 786                  * want a toggle to disable driver auto probe for this device.
 787                  */
 788
 789                 mutex_lock(&group->unbound_lock);
 790                 list_for_each_entry(unbound,
 791                                     &group->unbound_list, unbound_next) {
 792                         if (dev == unbound->dev) {
 793                                 list_del(&unbound->unbound_next);
 794                                 kfree(unbound);
 795                                 break;
 796                         }
 797                 }
 798                 mutex_unlock(&group->unbound_lock);
 799                 break;
 800         }
 801
 802         /*
 803          * If we're the last reference to the group, the group will be
 804          * released, which includes unregistering the iommu group notifier.
 805          * We hold a read-lock on that notifier list, unregistering needs
 806          * a write-lock... deadlock.  Release our reference asynchronously
 807          * to avoid that situation.
 808          */
 809         vfio_group_schedule_put(group);
 810         return NOTIFY_OK;
 811 }
 812
 813 /**
 814  * VFIO driver API
 815  */
 816 void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
 817                          const struct vfio_device_ops *ops)
 818 {
 819         init_completion(&device->comp);
 820         device->dev = dev;
 821         device->ops = ops;
 822 }
 823 EXPORT_SYMBOL_GPL(vfio_init_group_dev);
 824
/*
 * vfio_uninit_group_dev - undo vfio_init_group_dev()
 *
 * Releases the device's membership in its device set, if it has one.
 */
void vfio_uninit_group_dev(struct vfio_device *device)
{
        vfio_release_device_set(device);
}
EXPORT_SYMBOL_GPL(vfio_uninit_group_dev);
 830
 831 int vfio_register_group_dev(struct vfio_device *device)
 832 {
 833         struct vfio_device *existing_device;
 834         struct iommu_group *iommu_group;
 835         struct vfio_group *group;
 836
 837         /*
 838          * If the driver doesn't specify a set then the device is added to a
 839          * singleton set just for itself.
 840          */
 841         if (!device->dev_set)
 842                 vfio_assign_device_set(device, device);
 843
 844         iommu_group = iommu_group_get(device->dev);
 845         if (!iommu_group)
 846                 return -EINVAL;
 847
 848         group = vfio_group_get_from_iommu(iommu_group);
 849         if (!group) {
 850                 group = vfio_create_group(iommu_group);
 851                 if (IS_ERR(group)) {
 852                         iommu_group_put(iommu_group);
 853                         return PTR_ERR(group);
 854                 }
 855         } else {
 856                 /*
 857                  * A found vfio_group already holds a reference to the
 858                  * iommu_group.  A created vfio_group keeps the reference.
 859                  */
 860                 iommu_group_put(iommu_group);
 861         }
 862
 863         existing_device = vfio_group_get_device(group, device->dev);
 864         if (existing_device) {
 865                 dev_WARN(device->dev, "Device already exists on group %d\n",
 866                          iommu_group_id(iommu_group));
 867                 vfio_device_put(existing_device);
 868                 vfio_group_put(group);
 869                 return -EBUSY;
 870         }
 871
 872         /* Our reference on group is moved to the device */
 873         device->group = group;
 874
 875         /* Refcounting can't start until the driver calls register */
 876         refcount_set(&device->refcount, 1);
 877
 878         mutex_lock(&group->device_lock);
 879         list_add(&device->group_next, &group->device_list);
 880         group->dev_counter++;
 881         mutex_unlock(&group->device_lock);
 882
 883         return 0;
 884 }
 885 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
 886
 887 /**
 888  * Get a reference to the vfio_device for a device.  Even if the
 889  * caller thinks they own the device, they could be racing with a
 890  * release call path, so we can't trust drvdata for the shortcut.
 891  * Go the long way around, from the iommu_group to the vfio_group
 892  * to the vfio_device.
 893  */
 894 struct vfio_device *vfio_device_get_from_dev(struct device *dev)
 895 {
 896         struct vfio_group *group;
 897         struct vfio_device *device;
 898
 899         group = vfio_group_get_from_dev(dev);
 900         if (!group)
 901                 return NULL;
 902
 903         device = vfio_group_get_device(group, dev);
 904         vfio_group_put(group);
 905
 906         return device;
 907 }
 908 EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
 909
 910 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
 911                                                      char *buf)
 912 {
 913         struct vfio_device *it, *device = ERR_PTR(-ENODEV);
 914
 915         mutex_lock(&group->device_lock);
 916         list_for_each_entry(it, &group->device_list, group_next) {
 917                 int ret;
 918
 919                 if (it->ops->match) {
 920                         ret = it->ops->match(it, buf);
 921                         if (ret < 0) {
 922                                 device = ERR_PTR(ret);
 923                                 break;
 924                         }
 925                 } else {
 926                         ret = !strcmp(dev_name(it->dev), buf);
 927                 }
 928
 929                 if (ret && vfio_device_try_get(it)) {
 930                         device = it;
 931                         break;
 932                 }
 933         }
 934         mutex_unlock(&group->device_lock);
 935
 936         return device;
 937 }
 938
/*
 * Drop the registration reference on @device and block until device->comp
 * is completed, then unlink the device from its group.
 *
 * While waiting, the driver's optional ->request callback is invoked with
 * an incrementing counter before each wait of up to ten seconds.  The wait
 * starts out interruptible; once it has been interrupted a warning is
 * logged and all further waits are uninterruptible.
 */
void vfio_unregister_group_dev(struct vfio_device *device)
{
        struct vfio_group *group = device->group;
        struct vfio_unbound_dev *unbound;
        unsigned int i = 0;
        bool interrupted = false;
        long rc;

        /*
         * When the device is removed from the group, the group suddenly
         * becomes non-viable; the device has a driver (until the unbind
         * completes), but it's not present in the group.  This is bad news
         * for any external users that need to re-acquire a group reference
         * in order to match and release their existing reference.  To
         * solve this, we track such devices on the unbound_list to bridge
         * the gap until they're fully unbound.
         */
        unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
        if (unbound) {
                unbound->dev = device->dev;
                mutex_lock(&group->unbound_lock);
                list_add(&unbound->unbound_next, &group->unbound_list);
                mutex_unlock(&group->unbound_lock);
        }
        /* Allocation failure is only warned about; removal still proceeds. */
        WARN_ON(!unbound);

        /* Drop the reference set up by vfio_register_group_dev(). */
        vfio_device_put(device);
        rc = try_wait_for_completion(&device->comp);
        /* rc <= 0: not completed yet, timed out, or interrupted. */
        while (rc <= 0) {
                /* Ask the driver (if it cares) to get the device released. */
                if (device->ops->request)
                        device->ops->request(device, i++);

                if (interrupted) {
                        rc = wait_for_completion_timeout(&device->comp,
                                                         HZ * 10);
                } else {
                        rc = wait_for_completion_interruptible_timeout(
                                &device->comp, HZ * 10);
                        if (rc < 0) {
                                /* Switch to uninterruptible waits from now on. */
                                interrupted = true;
                                dev_warn(device->dev,
                                         "Device is currently in use, task"
                                         " \"%s\" (%d) "
                                         "blocked until device is released",
                                         current->comm, task_pid_nr(current));
                        }
                }
        }

        mutex_lock(&group->device_lock);
        list_del(&device->group_next);
        group->dev_counter--;
        mutex_unlock(&group->device_lock);

        /*
         * In order to support multiple devices per group, devices can be
         * plucked from the group while other devices in the group are still
         * in use.  The container persists with this group and those remaining
         * devices still attached.  If the user creates an isolation violation
         * by binding this device to another driver while the group is still in
         * use, that's their fault.  However, in the case of removing the last,
         * or potentially the only, device in the group there can be no other
         * in-use devices in the group.  The user has done their due diligence
         * and we should lay no claims to those devices.  In order to do that,
         * we need to make sure the group is detached from the container.
         * Without this stall, we're potentially racing with a user process
         * that may attempt to immediately bind this device to another driver.
         */
        if (list_empty(&group->device_list))
                wait_event(group->container_q, !group->container);

        /* Matches the get in vfio_register_group_dev() */
        vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
1017
/*
 * VFIO base fd, /dev/vfio/vfio
 */
1021 static long vfio_ioctl_check_extension(struct vfio_container *container,
1022                                        unsigned long arg)
1023 {
1024         struct vfio_iommu_driver *driver;
1025         long ret = 0;
1026
1027         down_read(&container->group_lock);
1028
1029         driver = container->iommu_driver;
1030
1031         switch (arg) {
1032                 /* No base extensions yet */
1033         default:
1034                 /*
1035                  * If no driver is set, poll all registered drivers for
1036                  * extensions and return the first positive result.  If
1037                  * a driver is already set, further queries will be passed
1038                  * only to that driver.
1039                  */
1040                 if (!driver) {
1041                         mutex_lock(&vfio.iommu_drivers_lock);
1042                         list_for_each_entry(driver, &vfio.iommu_drivers_list,
1043                                             vfio_next) {
1044
1045 #ifdef CONFIG_VFIO_NOIOMMU
1046                                 if (!list_empty(&container->group_list) &&
1047                                     (container->noiommu !=
1048                                      (driver->ops == &vfio_noiommu_ops)))
1049                                         continue;
1050 #endif
1051
1052                                 if (!try_module_get(driver->ops->owner))
1053                                         continue;
1054
1055                                 ret = driver->ops->ioctl(NULL,
1056                                                          VFIO_CHECK_EXTENSION,
1057                                                          arg);
1058                                 module_put(driver->ops->owner);
1059                                 if (ret > 0)
1060                                         break;
1061                         }
1062                         mutex_unlock(&vfio.iommu_drivers_lock);
1063                 } else
1064                         ret = driver->ops->ioctl(container->iommu_data,
1065                                                  VFIO_CHECK_EXTENSION, arg);
1066         }
1067
1068         up_read(&container->group_lock);
1069
1070         return ret;
1071 }
1072
1073 /* hold write lock on container->group_lock */
1074 static int __vfio_container_attach_groups(struct vfio_container *container,
1075                                           struct vfio_iommu_driver *driver,
1076                                           void *data)
1077 {
1078         struct vfio_group *group;
1079         int ret = -ENODEV;
1080
1081         list_for_each_entry(group, &container->group_list, container_next) {
1082                 ret = driver->ops->attach_group(data, group->iommu_group);
1083                 if (ret)
1084                         goto unwind;
1085         }
1086
1087         return ret;
1088
1089 unwind:
1090         list_for_each_entry_continue_reverse(group, &container->group_list,
1091                                              container_next) {
1092                 driver->ops->detach_group(data, group->iommu_group);
1093         }
1094
1095         return ret;
1096 }
1097
1098 static long vfio_ioctl_set_iommu(struct vfio_container *container,
1099                                  unsigned long arg)
1100 {
1101         struct vfio_iommu_driver *driver;
1102         long ret = -ENODEV;
1103
1104         down_write(&container->group_lock);
1105
1106         /*
1107          * The container is designed to be an unprivileged interface while
1108          * the group can be assigned to specific users.  Therefore, only by
1109          * adding a group to a container does the user get the privilege of
1110          * enabling the iommu, which may allocate finite resources.  There
1111          * is no unset_iommu, but by removing all the groups from a container,
1112          * the container is deprivileged and returns to an unset state.
1113          */
1114         if (list_empty(&container->group_list) || container->iommu_driver) {
1115                 up_write(&container->group_lock);
1116                 return -EINVAL;
1117         }
1118
1119         mutex_lock(&vfio.iommu_drivers_lock);
1120         list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1121                 void *data;
1122
1123 #ifdef CONFIG_VFIO_NOIOMMU
1124                 /*
1125                  * Only noiommu containers can use vfio-noiommu and noiommu
1126                  * containers can only use vfio-noiommu.
1127                  */
1128                 if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1129                         continue;
1130 #endif
1131
1132                 if (!try_module_get(driver->ops->owner))
1133                         continue;
1134
1135                 /*
1136                  * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1137                  * so test which iommu driver reported support for this
1138                  * extension and call open on them.  We also pass them the
1139                  * magic, allowing a single driver to support multiple
1140                  * interfaces if they'd like.
1141                  */
1142                 if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1143                         module_put(driver->ops->owner);
1144                         continue;
1145                 }
1146
1147                 data = driver->ops->open(arg);
1148                 if (IS_ERR(data)) {
1149                         ret = PTR_ERR(data);
1150                         module_put(driver->ops->owner);
1151                         continue;
1152                 }
1153
1154                 ret = __vfio_container_attach_groups(container, driver, data);
1155                 if (ret) {
1156                         driver->ops->release(data);
1157                         module_put(driver->ops->owner);
1158                         continue;
1159                 }
1160
1161                 container->iommu_driver = driver;
1162                 container->iommu_data = data;
1163                 break;
1164         }
1165
1166         mutex_unlock(&vfio.iommu_drivers_lock);
1167         up_write(&container->group_lock);
1168
1169         return ret;
1170 }
1171
1172 static long vfio_fops_unl_ioctl(struct file *filep,
1173                                 unsigned int cmd, unsigned long arg)
1174 {
1175         struct vfio_container *container = filep->private_data;
1176         struct vfio_iommu_driver *driver;
1177         void *data;
1178         long ret = -EINVAL;
1179
1180         if (!container)
1181                 return ret;
1182
1183         switch (cmd) {
1184         case VFIO_GET_API_VERSION:
1185                 ret = VFIO_API_VERSION;
1186                 break;
1187         case VFIO_CHECK_EXTENSION:
1188                 ret = vfio_ioctl_check_extension(container, arg);
1189                 break;
1190         case VFIO_SET_IOMMU:
1191                 ret = vfio_ioctl_set_iommu(container, arg);
1192                 break;
1193         default:
1194                 driver = container->iommu_driver;
1195                 data = container->iommu_data;
1196
1197                 if (driver) /* passthrough all unrecognized ioctls */
1198                         ret = driver->ops->ioctl(data, cmd, arg);
1199         }
1200
1201         return ret;
1202 }
1203
1204 static int vfio_fops_open(struct inode *inode, struct file *filep)
1205 {
1206         struct vfio_container *container;
1207
1208         container = kzalloc(sizeof(*container), GFP_KERNEL);
1209         if (!container)
1210                 return -ENOMEM;
1211
1212         INIT_LIST_HEAD(&container->group_list);
1213         init_rwsem(&container->group_lock);
1214         kref_init(&container->kref);
1215
1216         filep->private_data = container;
1217
1218         return 0;
1219 }
1220
1221 static int vfio_fops_release(struct inode *inode, struct file *filep)
1222 {
1223         struct vfio_container *container = filep->private_data;
1224         struct vfio_iommu_driver *driver = container->iommu_driver;
1225
1226         if (driver && driver->ops->notify)
1227                 driver->ops->notify(container->iommu_data,
1228                                     VFIO_IOMMU_CONTAINER_CLOSE);
1229
1230         filep->private_data = NULL;
1231
1232         vfio_container_put(container);
1233
1234         return 0;
1235 }
1236
1237 /*
1238  * Once an iommu driver is set, we optionally pass read/write/mmap
1239  * on to the driver, allowing management interfaces beyond ioctl.
1240  */
1241 static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1242                               size_t count, loff_t *ppos)
1243 {
1244         struct vfio_container *container = filep->private_data;
1245         struct vfio_iommu_driver *driver;
1246         ssize_t ret = -EINVAL;
1247
1248         driver = container->iommu_driver;
1249         if (likely(driver && driver->ops->read))
1250                 ret = driver->ops->read(container->iommu_data,
1251                                         buf, count, ppos);
1252
1253         return ret;
1254 }
1255
1256 static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1257                                size_t count, loff_t *ppos)
1258 {
1259         struct vfio_container *container = filep->private_data;
1260         struct vfio_iommu_driver *driver;
1261         ssize_t ret = -EINVAL;
1262
1263         driver = container->iommu_driver;
1264         if (likely(driver && driver->ops->write))
1265                 ret = driver->ops->write(container->iommu_data,
1266                                          buf, count, ppos);
1267
1268         return ret;
1269 }
1270
1271 static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1272 {
1273         struct vfio_container *container = filep->private_data;
1274         struct vfio_iommu_driver *driver;
1275         int ret = -EINVAL;
1276
1277         driver = container->iommu_driver;
1278         if (likely(driver && driver->ops->mmap))
1279                 ret = driver->ops->mmap(container->iommu_data, vma);
1280
1281         return ret;
1282 }
1283
/*
 * File operations for the container fd.  vfio_group_set_container()
 * compares a file's f_op against this table to recognise container fds.
 */
static const struct file_operations vfio_fops = {
        .owner          = THIS_MODULE,
        .open           = vfio_fops_open,
        .release        = vfio_fops_release,
        .read           = vfio_fops_read,
        .write          = vfio_fops_write,
        .unlocked_ioctl = vfio_fops_unl_ioctl,
        .compat_ioctl   = compat_ptr_ioctl,
        .mmap           = vfio_fops_mmap,
};
1294
/*
 * VFIO Group fd, /dev/vfio/$GROUP
 */
1298 static void __vfio_group_unset_container(struct vfio_group *group)
1299 {
1300         struct vfio_container *container = group->container;
1301         struct vfio_iommu_driver *driver;
1302
1303         down_write(&container->group_lock);
1304
1305         driver = container->iommu_driver;
1306         if (driver)
1307                 driver->ops->detach_group(container->iommu_data,
1308                                           group->iommu_group);
1309
1310         group->container = NULL;
1311         wake_up(&group->container_q);
1312         list_del(&group->container_next);
1313
1314         /* Detaching the last group deprivileges a container, remove iommu */
1315         if (driver && list_empty(&container->group_list)) {
1316                 driver->ops->release(container->iommu_data);
1317                 module_put(driver->ops->owner);
1318                 container->iommu_driver = NULL;
1319                 container->iommu_data = NULL;
1320         }
1321
1322         up_write(&container->group_lock);
1323
1324         vfio_container_put(container);
1325 }
1326
1327 /*
1328  * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1329  * if there was no container to unset.  Since the ioctl is called on
1330  * the group, we know that still exists, therefore the only valid
1331  * transition here is 1->0.
1332  */
1333 static int vfio_group_unset_container(struct vfio_group *group)
1334 {
1335         int users = atomic_cmpxchg(&group->container_users, 1, 0);
1336
1337         if (!users)
1338                 return -EINVAL;
1339         if (users != 1)
1340                 return -EBUSY;
1341
1342         __vfio_group_unset_container(group);
1343
1344         return 0;
1345 }
1346
1347 /*
1348  * When removing container users, anything that removes the last user
1349  * implicitly removes the group from the container.  That is, if the
1350  * group file descriptor is closed, as well as any device file descriptors,
1351  * the group is free.
1352  */
1353 static void vfio_group_try_dissolve_container(struct vfio_group *group)
1354 {
1355         if (0 == atomic_dec_if_positive(&group->container_users))
1356                 __vfio_group_unset_container(group);
1357 }
1358
1359 static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1360 {
1361         struct fd f;
1362         struct vfio_container *container;
1363         struct vfio_iommu_driver *driver;
1364         int ret = 0;
1365
1366         if (atomic_read(&group->container_users))
1367                 return -EINVAL;
1368
1369         if (group->noiommu && !capable(CAP_SYS_RAWIO))
1370                 return -EPERM;
1371
1372         f = fdget(container_fd);
1373         if (!f.file)
1374                 return -EBADF;
1375
1376         /* Sanity check, is this really our fd? */
1377         if (f.file->f_op != &vfio_fops) {
1378                 fdput(f);
1379                 return -EINVAL;
1380         }
1381
1382         container = f.file->private_data;
1383         WARN_ON(!container); /* fget ensures we don't race vfio_release */
1384
1385         down_write(&container->group_lock);
1386
1387         /* Real groups and fake groups cannot mix */
1388         if (!list_empty(&container->group_list) &&
1389             container->noiommu != group->noiommu) {
1390                 ret = -EPERM;
1391                 goto unlock_out;
1392         }
1393
1394         driver = container->iommu_driver;
1395         if (driver) {
1396                 ret = driver->ops->attach_group(container->iommu_data,
1397                                                 group->iommu_group);
1398                 if (ret)
1399                         goto unlock_out;
1400         }
1401
1402         group->container = container;
1403         container->noiommu = group->noiommu;
1404         list_add(&group->container_next, &container->group_list);
1405
1406         /* Get a reference on the container and mark a user within the group */
1407         vfio_container_get(container);
1408         atomic_inc(&group->container_users);
1409
1410 unlock_out:
1411         up_write(&container->group_lock);
1412         fdput(f);
1413         return ret;
1414 }
1415
1416 static bool vfio_group_viable(struct vfio_group *group)
1417 {
1418         return (iommu_group_for_each_dev(group->iommu_group,
1419                                          group, vfio_dev_viable) == 0);
1420 }
1421
1422 static int vfio_group_add_container_user(struct vfio_group *group)
1423 {
1424         if (!atomic_inc_not_zero(&group->container_users))
1425                 return -EINVAL;
1426
1427         if (group->noiommu) {
1428                 atomic_dec(&group->container_users);
1429                 return -EPERM;
1430         }
1431         if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1432                 atomic_dec(&group->container_users);
1433                 return -EINVAL;
1434         }
1435
1436         return 0;
1437 }
1438
/* Defined further down; needed here to create the device file. */
static const struct file_operations vfio_device_fops;

/*
 * VFIO_GROUP_GET_DEVICE_FD: look up the device named @buf in @group and
 * return a new file descriptor for it.
 *
 * The group must have a container user, its container must have an iommu
 * driver set, and the group must be viable; otherwise -EINVAL.  noiommu
 * groups additionally require CAP_SYS_RAWIO (-EPERM).  On success the
 * device reference, the reference on the device driver's module, the
 * open_count increment and one extra container user stay held for the
 * lifetime of the file; vfio_device_fops_release() undoes them.
 *
 * Returns the new fd number (>= 0) or a negative errno.
 */
static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
        struct vfio_device *device;
        struct file *filep;
        int fdno;
        int ret = 0;

        if (0 == atomic_read(&group->container_users) ||
            !group->container->iommu_driver || !vfio_group_viable(group))
                return -EINVAL;

        if (group->noiommu && !capable(CAP_SYS_RAWIO))
                return -EPERM;

        device = vfio_device_get_from_name(group, buf);
        if (IS_ERR(device))
                return PTR_ERR(device);

        /* Pin the module of the driver bound to the underlying device. */
        if (!try_module_get(device->dev->driver->owner)) {
                ret = -ENODEV;
                goto err_device_put;
        }

        /* Only the first opener triggers the driver's open_device(). */
        mutex_lock(&device->dev_set->lock);
        device->open_count++;
        if (device->open_count == 1 && device->ops->open_device) {
                ret = device->ops->open_device(device);
                if (ret)
                        goto err_undo_count; /* jumps with dev_set->lock held */
        }
        mutex_unlock(&device->dev_set->lock);

        /*
         * We can't use anon_inode_getfd() because we need to modify
         * the f_mode flags directly to allow more than just ioctls
         */
        fdno = ret = get_unused_fd_flags(O_CLOEXEC);
        if (ret < 0)
                goto err_close_device;

        filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
                                   device, O_RDWR);
        if (IS_ERR(filep)) {
                ret = PTR_ERR(filep);
                goto err_fd;
        }

        /*
         * TODO: add an anon_inode interface to do this.
         * Appears to be missing by lack of need rather than
         * explicitly prevented.  Now there's need.
         */
        filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

        /* The open device fd counts as a container user of the group. */
        atomic_inc(&group->container_users);

        fd_install(fdno, filep);

        if (group->noiommu)
                dev_warn(device->dev, "vfio-noiommu device opened by user "
                         "(%s:%d)\n", current->comm, task_pid_nr(current));
        return fdno;

err_fd:
        put_unused_fd(fdno);
err_close_device:
        /* Retake the lock; close only if we were the sole opener. */
        mutex_lock(&device->dev_set->lock);
        if (device->open_count == 1 && device->ops->close_device)
                device->ops->close_device(device);
err_undo_count:
        /* Reached with dev_set->lock held on every path. */
        device->open_count--;
        mutex_unlock(&device->dev_set->lock);
        module_put(device->dev->driver->owner);
err_device_put:
        vfio_device_put(device);
        return ret;
}
1518
1519 static long vfio_group_fops_unl_ioctl(struct file *filep,
1520                                       unsigned int cmd, unsigned long arg)
1521 {
1522         struct vfio_group *group = filep->private_data;
1523         long ret = -ENOTTY;
1524
1525         switch (cmd) {
1526         case VFIO_GROUP_GET_STATUS:
1527         {
1528                 struct vfio_group_status status;
1529                 unsigned long minsz;
1530
1531                 minsz = offsetofend(struct vfio_group_status, flags);
1532
1533                 if (copy_from_user(&status, (void __user *)arg, minsz))
1534                         return -EFAULT;
1535
1536                 if (status.argsz < minsz)
1537                         return -EINVAL;
1538
1539                 status.flags = 0;
1540
1541                 if (vfio_group_viable(group))
1542                         status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1543
1544                 if (group->container)
1545                         status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1546
1547                 if (copy_to_user((void __user *)arg, &status, minsz))
1548                         return -EFAULT;
1549
1550                 ret = 0;
1551                 break;
1552         }
1553         case VFIO_GROUP_SET_CONTAINER:
1554         {
1555                 int fd;
1556
1557                 if (get_user(fd, (int __user *)arg))
1558                         return -EFAULT;
1559
1560                 if (fd < 0)
1561                         return -EINVAL;
1562
1563                 ret = vfio_group_set_container(group, fd);
1564                 break;
1565         }
1566         case VFIO_GROUP_UNSET_CONTAINER:
1567                 ret = vfio_group_unset_container(group);
1568                 break;
1569         case VFIO_GROUP_GET_DEVICE_FD:
1570         {
1571                 char *buf;
1572
1573                 buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1574                 if (IS_ERR(buf))
1575                         return PTR_ERR(buf);
1576
1577                 ret = vfio_group_get_device_fd(group, buf);
1578                 kfree(buf);
1579                 break;
1580         }
1581         }
1582
1583         return ret;
1584 }
1585
1586 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1587 {
1588         struct vfio_group *group;
1589         int opened;
1590
1591         group = vfio_group_get_from_minor(iminor(inode));
1592         if (!group)
1593                 return -ENODEV;
1594
1595         if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1596                 vfio_group_put(group);
1597                 return -EPERM;
1598         }
1599
1600         /* Do we need multiple instances of the group open?  Seems not. */
1601         opened = atomic_cmpxchg(&group->opened, 0, 1);
1602         if (opened) {
1603                 vfio_group_put(group);
1604                 return -EBUSY;
1605         }
1606
1607         /* Is something still in use from a previous open? */
1608         if (group->container) {
1609                 atomic_dec(&group->opened);
1610                 vfio_group_put(group);
1611                 return -EBUSY;
1612         }
1613
1614         /* Warn if previous user didn't cleanup and re-init to drop them */
1615         if (WARN_ON(group->notifier.head))
1616                 BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1617
1618         filep->private_data = group;
1619
1620         return 0;
1621 }
1622
1623 static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1624 {
1625         struct vfio_group *group = filep->private_data;
1626
1627         filep->private_data = NULL;
1628
1629         vfio_group_try_dissolve_container(group);
1630
1631         atomic_dec(&group->opened);
1632
1633         vfio_group_put(group);
1634
1635         return 0;
1636 }
1637
/* File operations for the group fd: ioctl-only, no read/write/mmap. */
static const struct file_operations vfio_group_fops = {
        .owner          = THIS_MODULE,
        .unlocked_ioctl = vfio_group_fops_unl_ioctl,
        .compat_ioctl   = compat_ptr_ioctl,
        .open           = vfio_group_fops_open,
        .release        = vfio_group_fops_release,
};
1645
/*
 * VFIO Device fd
 */
1649 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1650 {
1651         struct vfio_device *device = filep->private_data;
1652
1653         mutex_lock(&device->dev_set->lock);
1654         if (!--device->open_count && device->ops->close_device)
1655                 device->ops->close_device(device);
1656         mutex_unlock(&device->dev_set->lock);
1657
1658         module_put(device->dev->driver->owner);
1659
1660         vfio_group_try_dissolve_container(device->group);
1661
1662         vfio_device_put(device);
1663
1664         return 0;
1665 }
1666
1667 static long vfio_device_fops_unl_ioctl(struct file *filep,
1668                                        unsigned int cmd, unsigned long arg)
1669 {
1670         struct vfio_device *device = filep->private_data;
1671
1672         if (unlikely(!device->ops->ioctl))
1673                 return -EINVAL;
1674
1675         return device->ops->ioctl(device, cmd, arg);
1676 }
1677
1678 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1679                                      size_t count, loff_t *ppos)
1680 {
1681         struct vfio_device *device = filep->private_data;
1682
1683         if (unlikely(!device->ops->read))
1684                 return -EINVAL;
1685
1686         return device->ops->read(device, buf, count, ppos);
1687 }
1688
1689 static ssize_t vfio_device_fops_write(struct file *filep,
1690                                       const char __user *buf,
1691                                       size_t count, loff_t *ppos)
1692 {
1693         struct vfio_device *device = filep->private_data;
1694
1695         if (unlikely(!device->ops->write))
1696                 return -EINVAL;
1697
1698         return device->ops->write(device, buf, count, ppos);
1699 }
1700
1701 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1702 {
1703         struct vfio_device *device = filep->private_data;
1704
1705         if (unlikely(!device->ops->mmap))
1706                 return -EINVAL;
1707
1708         return device->ops->mmap(device, vma);
1709 }
1710
1711 static const struct file_operations vfio_device_fops = {
1712         .owner          = THIS_MODULE,
1713         .release        = vfio_device_fops_release,
1714         .read           = vfio_device_fops_read,
1715         .write          = vfio_device_fops_write,
1716         .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1717         .compat_ioctl   = compat_ptr_ioctl,
1718         .mmap           = vfio_device_fops_mmap,
1719 };
1720
1721 /**
1722  * External user API, exported by symbols to be linked dynamically.
1723  *
1724  * The protocol includes:
1725  *  1. do normal VFIO init operation:
1726  *      - opening a new container;
1727  *      - attaching group(s) to it;
1728  *      - setting an IOMMU driver for a container.
1729  * When IOMMU is set for a container, all groups in it are
1730  * considered ready to use by an external user.
1731  *
1732  * 2. User space passes a group fd to an external user.
1733  * The external user calls vfio_group_get_external_user()
1734  * to verify that:
1735  *      - the group is initialized;
1736  *      - IOMMU is set for it.
1737  * If both checks passed, vfio_group_get_external_user()
1738  * increments the container user counter to prevent
1739  * the VFIO group from disposal before KVM exits.
1740  *
1741  * 3. The external user calls vfio_external_user_iommu_id()
1742  * to know an IOMMU ID.
1743  *
1744  * 4. When the external KVM finishes, it calls
1745  * vfio_group_put_external_user() to release the VFIO group.
1746  * This call decrements the container user counter.
1747  */
1748 struct vfio_group *vfio_group_get_external_user(struct file *filep)
1749 {
1750         struct vfio_group *group = filep->private_data;
1751         int ret;
1752
1753         if (filep->f_op != &vfio_group_fops)
1754                 return ERR_PTR(-EINVAL);
1755
1756         ret = vfio_group_add_container_user(group);
1757         if (ret)
1758                 return ERR_PTR(ret);
1759
1760         vfio_group_get(group);
1761
1762         return group;
1763 }
1764 EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
1765
1766 /**
1767  * External user API, exported by symbols to be linked dynamically.
1768  * The external user passes in a device pointer
1769  * to verify that:
1770  *      - A VFIO group is assiciated with the device;
1771  *      - IOMMU is set for the group.
1772  * If both checks passed, vfio_group_get_external_user_from_dev()
1773  * increments the container user counter to prevent the VFIO group
1774  * from disposal before external user exits and returns the pointer
1775  * to the VFIO group.
1776  *
1777  * When the external user finishes using the VFIO group, it calls
1778  * vfio_group_put_external_user() to release the VFIO group and
1779  * decrement the container user counter.
1780  *
1781  * @dev [in]    : device
1782  * Return error PTR or pointer to VFIO group.
1783  */
1784
1785 struct vfio_group *vfio_group_get_external_user_from_dev(struct device *dev)
1786 {
1787         struct vfio_group *group;
1788         int ret;
1789
1790         group = vfio_group_get_from_dev(dev);
1791         if (!group)
1792                 return ERR_PTR(-ENODEV);
1793
1794         ret = vfio_group_add_container_user(group);
1795         if (ret) {
1796                 vfio_group_put(group);
1797                 return ERR_PTR(ret);
1798         }
1799
1800         return group;
1801 }
1802 EXPORT_SYMBOL_GPL(vfio_group_get_external_user_from_dev);
1803
/*
 * Release a group obtained through one of the
 * vfio_group_get_external_user*() helpers: drop the container user first,
 * then the group reference.
 */
void vfio_group_put_external_user(struct vfio_group *group)
{
	vfio_group_try_dissolve_container(group);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1810
1811 bool vfio_external_group_match_file(struct vfio_group *test_group,
1812                                     struct file *filep)
1813 {
1814         struct vfio_group *group = filep->private_data;
1815
1816         return (filep->f_op == &vfio_group_fops) && (group == test_group);
1817 }
1818 EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1819
1820 int vfio_external_user_iommu_id(struct vfio_group *group)
1821 {
1822         return iommu_group_id(group->iommu_group);
1823 }
1824 EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1825
1826 long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1827 {
1828         return vfio_ioctl_check_extension(group->container, arg);
1829 }
1830 EXPORT_SYMBOL_GPL(vfio_external_check_extension);
1831
1832 /**
1833  * Sub-module support
1834  */
1835 /*
1836  * Helper for managing a buffer of info chain capabilities, allocate or
1837  * reallocate a buffer with additional @size, filling in @id and @version
1838  * of the capability.  A pointer to the new capability is returned.
1839  *
1840  * NB. The chain is based at the head of the buffer, so new entries are
1841  * added to the tail, vfio_info_cap_shift() should be called to fixup the
1842  * next offsets prior to copying to the user buffer.
1843  */
1844 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1845                                                size_t size, u16 id, u16 version)
1846 {
1847         void *buf;
1848         struct vfio_info_cap_header *header, *tmp;
1849
1850         buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1851         if (!buf) {
1852                 kfree(caps->buf);
1853                 caps->buf = NULL;
1854                 caps->size = 0;
1855                 return ERR_PTR(-ENOMEM);
1856         }
1857
1858         caps->buf = buf;
1859         header = buf + caps->size;
1860
1861         /* Eventually copied to user buffer, zero */
1862         memset(header, 0, size);
1863
1864         header->id = id;
1865         header->version = version;
1866
1867         /* Add to the end of the capability chain */
1868         for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1869                 ; /* nothing */
1870
1871         tmp->next = caps->size;
1872         caps->size += size;
1873
1874         return header;
1875 }
1876 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1877
1878 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1879 {
1880         struct vfio_info_cap_header *tmp;
1881         void *buf = (void *)caps->buf;
1882
1883         for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1884                 tmp->next += offset;
1885 }
1886 EXPORT_SYMBOL(vfio_info_cap_shift);
1887
1888 int vfio_info_add_capability(struct vfio_info_cap *caps,
1889                              struct vfio_info_cap_header *cap, size_t size)
1890 {
1891         struct vfio_info_cap_header *header;
1892
1893         header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1894         if (IS_ERR(header))
1895                 return PTR_ERR(header);
1896
1897         memcpy(header + 1, cap + 1, size - sizeof(*header));
1898
1899         return 0;
1900 }
1901 EXPORT_SYMBOL(vfio_info_add_capability);
1902
1903 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1904                                        int max_irq_type, size_t *data_size)
1905 {
1906         unsigned long minsz;
1907         size_t size;
1908
1909         minsz = offsetofend(struct vfio_irq_set, count);
1910
1911         if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1912             (hdr->count >= (U32_MAX - hdr->start)) ||
1913             (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1914                                 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1915                 return -EINVAL;
1916
1917         if (data_size)
1918                 *data_size = 0;
1919
1920         if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1921                 return -EINVAL;
1922
1923         switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1924         case VFIO_IRQ_SET_DATA_NONE:
1925                 size = 0;
1926                 break;
1927         case VFIO_IRQ_SET_DATA_BOOL:
1928                 size = sizeof(uint8_t);
1929                 break;
1930         case VFIO_IRQ_SET_DATA_EVENTFD:
1931                 size = sizeof(int32_t);
1932                 break;
1933         default:
1934                 return -EINVAL;
1935         }
1936
1937         if (size) {
1938                 if (hdr->argsz - minsz < hdr->count * size)
1939                         return -EINVAL;
1940
1941                 if (!data_size)
1942                         return -EINVAL;
1943
1944                 *data_size = hdr->count * size;
1945         }
1946
1947         return 0;
1948 }
1949 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1950
1951 /*
1952  * Pin a set of guest PFNs and return their associated host PFNs for local
1953  * domain only.
1954  * @dev [in]     : device
1955  * @user_pfn [in]: array of user/guest PFNs to be pinned.
1956  * @npage [in]   : count of elements in user_pfn array.  This count should not
1957  *                 be greater VFIO_PIN_PAGES_MAX_ENTRIES.
1958  * @prot [in]    : protection flags
1959  * @phys_pfn[out]: array of host PFNs
1960  * Return error or number of pages pinned.
1961  */
1962 int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
1963                    int prot, unsigned long *phys_pfn)
1964 {
1965         struct vfio_container *container;
1966         struct vfio_group *group;
1967         struct vfio_iommu_driver *driver;
1968         int ret;
1969
1970         if (!dev || !user_pfn || !phys_pfn || !npage)
1971                 return -EINVAL;
1972
1973         if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1974                 return -E2BIG;
1975
1976         group = vfio_group_get_from_dev(dev);
1977         if (!group)
1978                 return -ENODEV;
1979
1980         if (group->dev_counter > 1) {
1981                 ret = -EINVAL;
1982                 goto err_pin_pages;
1983         }
1984
1985         ret = vfio_group_add_container_user(group);
1986         if (ret)
1987                 goto err_pin_pages;
1988
1989         container = group->container;
1990         driver = container->iommu_driver;
1991         if (likely(driver && driver->ops->pin_pages))
1992                 ret = driver->ops->pin_pages(container->iommu_data,
1993                                              group->iommu_group, user_pfn,
1994                                              npage, prot, phys_pfn);
1995         else
1996                 ret = -ENOTTY;
1997
1998         vfio_group_try_dissolve_container(group);
1999
2000 err_pin_pages:
2001         vfio_group_put(group);
2002         return ret;
2003 }
2004 EXPORT_SYMBOL(vfio_pin_pages);
2005
2006 /*
2007  * Unpin set of host PFNs for local domain only.
2008  * @dev [in]     : device
2009  * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
2010  *                 PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
2011  * @npage [in]   : count of elements in user_pfn array.  This count should not
2012  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
2013  * Return error or number of pages unpinned.
2014  */
2015 int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
2016 {
2017         struct vfio_container *container;
2018         struct vfio_group *group;
2019         struct vfio_iommu_driver *driver;
2020         int ret;
2021
2022         if (!dev || !user_pfn || !npage)
2023                 return -EINVAL;
2024
2025         if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2026                 return -E2BIG;
2027
2028         group = vfio_group_get_from_dev(dev);
2029         if (!group)
2030                 return -ENODEV;
2031
2032         ret = vfio_group_add_container_user(group);
2033         if (ret)
2034                 goto err_unpin_pages;
2035
2036         container = group->container;
2037         driver = container->iommu_driver;
2038         if (likely(driver && driver->ops->unpin_pages))
2039                 ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
2040                                                npage);
2041         else
2042                 ret = -ENOTTY;
2043
2044         vfio_group_try_dissolve_container(group);
2045
2046 err_unpin_pages:
2047         vfio_group_put(group);
2048         return ret;
2049 }
2050 EXPORT_SYMBOL(vfio_unpin_pages);
2051
2052 /*
2053  * Pin a set of guest IOVA PFNs and return their associated host PFNs for a
2054  * VFIO group.
2055  *
2056  * The caller needs to call vfio_group_get_external_user() or
2057  * vfio_group_get_external_user_from_dev() prior to calling this interface,
2058  * so as to prevent the VFIO group from disposal in the middle of the call.
2059  * But it can keep the reference to the VFIO group for several calls into
2060  * this interface.
2061  * After finishing using of the VFIO group, the caller needs to release the
2062  * VFIO group by calling vfio_group_put_external_user().
2063  *
2064  * @group [in]          : VFIO group
2065  * @user_iova_pfn [in]  : array of user/guest IOVA PFNs to be pinned.
2066  * @npage [in]          : count of elements in user_iova_pfn array.
2067  *                        This count should not be greater
2068  *                        VFIO_PIN_PAGES_MAX_ENTRIES.
2069  * @prot [in]           : protection flags
2070  * @phys_pfn [out]      : array of host PFNs
2071  * Return error or number of pages pinned.
2072  */
2073 int vfio_group_pin_pages(struct vfio_group *group,
2074                          unsigned long *user_iova_pfn, int npage,
2075                          int prot, unsigned long *phys_pfn)
2076 {
2077         struct vfio_container *container;
2078         struct vfio_iommu_driver *driver;
2079         int ret;
2080
2081         if (!group || !user_iova_pfn || !phys_pfn || !npage)
2082                 return -EINVAL;
2083
2084         if (group->dev_counter > 1)
2085                 return -EINVAL;
2086
2087         if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2088                 return -E2BIG;
2089
2090         container = group->container;
2091         driver = container->iommu_driver;
2092         if (likely(driver && driver->ops->pin_pages))
2093                 ret = driver->ops->pin_pages(container->iommu_data,
2094                                              group->iommu_group, user_iova_pfn,
2095                                              npage, prot, phys_pfn);
2096         else
2097                 ret = -ENOTTY;
2098
2099         return ret;
2100 }
2101 EXPORT_SYMBOL(vfio_group_pin_pages);
2102
2103 /*
2104  * Unpin a set of guest IOVA PFNs for a VFIO group.
2105  *
2106  * The caller needs to call vfio_group_get_external_user() or
2107  * vfio_group_get_external_user_from_dev() prior to calling this interface,
2108  * so as to prevent the VFIO group from disposal in the middle of the call.
2109  * But it can keep the reference to the VFIO group for several calls into
2110  * this interface.
2111  * After finishing using of the VFIO group, the caller needs to release the
2112  * VFIO group by calling vfio_group_put_external_user().
2113  *
2114  * @group [in]          : vfio group
2115  * @user_iova_pfn [in]  : array of user/guest IOVA PFNs to be unpinned.
2116  * @npage [in]          : count of elements in user_iova_pfn array.
2117  *                        This count should not be greater than
2118  *                        VFIO_PIN_PAGES_MAX_ENTRIES.
2119  * Return error or number of pages unpinned.
2120  */
2121 int vfio_group_unpin_pages(struct vfio_group *group,
2122                            unsigned long *user_iova_pfn, int npage)
2123 {
2124         struct vfio_container *container;
2125         struct vfio_iommu_driver *driver;
2126         int ret;
2127
2128         if (!group || !user_iova_pfn || !npage)
2129                 return -EINVAL;
2130
2131         if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2132                 return -E2BIG;
2133
2134         container = group->container;
2135         driver = container->iommu_driver;
2136         if (likely(driver && driver->ops->unpin_pages))
2137                 ret = driver->ops->unpin_pages(container->iommu_data,
2138                                                user_iova_pfn, npage);
2139         else
2140                 ret = -ENOTTY;
2141
2142         return ret;
2143 }
2144 EXPORT_SYMBOL(vfio_group_unpin_pages);
2145
2146
2147 /*
2148  * This interface allows the CPUs to perform some sort of virtual DMA on
2149  * behalf of the device.
2150  *
2151  * CPUs read/write from/into a range of IOVAs pointing to user space memory
2152  * into/from a kernel buffer.
2153  *
2154  * As the read/write of user space memory is conducted via the CPUs and is
2155  * not a real device DMA, it is not necessary to pin the user space memory.
2156  *
2157  * The caller needs to call vfio_group_get_external_user() or
2158  * vfio_group_get_external_user_from_dev() prior to calling this interface,
2159  * so as to prevent the VFIO group from disposal in the middle of the call.
2160  * But it can keep the reference to the VFIO group for several calls into
2161  * this interface.
2162  * After finishing using of the VFIO group, the caller needs to release the
2163  * VFIO group by calling vfio_group_put_external_user().
2164  *
2165  * @group [in]          : VFIO group
2166  * @user_iova [in]      : base IOVA of a user space buffer
2167  * @data [in]           : pointer to kernel buffer
2168  * @len [in]            : kernel buffer length
2169  * @write               : indicate read or write
2170  * Return error code on failure or 0 on success.
2171  */
2172 int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova,
2173                 void *data, size_t len, bool write)
2174 {
2175         struct vfio_container *container;
2176         struct vfio_iommu_driver *driver;
2177         int ret = 0;
2178
2179         if (!group || !data || len <= 0)
2180                 return -EINVAL;
2181
2182         container = group->container;
2183         driver = container->iommu_driver;
2184
2185         if (likely(driver && driver->ops->dma_rw))
2186                 ret = driver->ops->dma_rw(container->iommu_data,
2187                                           user_iova, data, len, write);
2188         else
2189                 ret = -ENOTTY;
2190
2191         return ret;
2192 }
2193 EXPORT_SYMBOL(vfio_dma_rw);
2194
2195 static int vfio_register_iommu_notifier(struct vfio_group *group,
2196                                         unsigned long *events,
2197                                         struct notifier_block *nb)
2198 {
2199         struct vfio_container *container;
2200         struct vfio_iommu_driver *driver;
2201         int ret;
2202
2203         ret = vfio_group_add_container_user(group);
2204         if (ret)
2205                 return -EINVAL;
2206
2207         container = group->container;
2208         driver = container->iommu_driver;
2209         if (likely(driver && driver->ops->register_notifier))
2210                 ret = driver->ops->register_notifier(container->iommu_data,
2211                                                      events, nb);
2212         else
2213                 ret = -ENOTTY;
2214
2215         vfio_group_try_dissolve_container(group);
2216
2217         return ret;
2218 }
2219
2220 static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2221                                           struct notifier_block *nb)
2222 {
2223         struct vfio_container *container;
2224         struct vfio_iommu_driver *driver;
2225         int ret;
2226
2227         ret = vfio_group_add_container_user(group);
2228         if (ret)
2229                 return -EINVAL;
2230
2231         container = group->container;
2232         driver = container->iommu_driver;
2233         if (likely(driver && driver->ops->unregister_notifier))
2234                 ret = driver->ops->unregister_notifier(container->iommu_data,
2235                                                        nb);
2236         else
2237                 ret = -ENOTTY;
2238
2239         vfio_group_try_dissolve_container(group);
2240
2241         return ret;
2242 }
2243
2244 void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2245 {
2246         group->kvm = kvm;
2247         blocking_notifier_call_chain(&group->notifier,
2248                                 VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2249 }
2250 EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
2251
2252 static int vfio_register_group_notifier(struct vfio_group *group,
2253                                         unsigned long *events,
2254                                         struct notifier_block *nb)
2255 {
2256         int ret;
2257         bool set_kvm = false;
2258
2259         if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2260                 set_kvm = true;
2261
2262         /* clear known events */
2263         *events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2264
2265         /* refuse to continue if still events remaining */
2266         if (*events)
2267                 return -EINVAL;
2268
2269         ret = vfio_group_add_container_user(group);
2270         if (ret)
2271                 return -EINVAL;
2272
2273         ret = blocking_notifier_chain_register(&group->notifier, nb);
2274
2275         /*
2276          * The attaching of kvm and vfio_group might already happen, so
2277          * here we replay once upon registration.
2278          */
2279         if (!ret && set_kvm && group->kvm)
2280                 blocking_notifier_call_chain(&group->notifier,
2281                                         VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2282
2283         vfio_group_try_dissolve_container(group);
2284
2285         return ret;
2286 }
2287
2288 static int vfio_unregister_group_notifier(struct vfio_group *group,
2289                                          struct notifier_block *nb)
2290 {
2291         int ret;
2292
2293         ret = vfio_group_add_container_user(group);
2294         if (ret)
2295                 return -EINVAL;
2296
2297         ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2298
2299         vfio_group_try_dissolve_container(group);
2300
2301         return ret;
2302 }
2303
2304 int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2305                            unsigned long *events, struct notifier_block *nb)
2306 {
2307         struct vfio_group *group;
2308         int ret;
2309
2310         if (!dev || !nb || !events || (*events == 0))
2311                 return -EINVAL;
2312
2313         group = vfio_group_get_from_dev(dev);
2314         if (!group)
2315                 return -ENODEV;
2316
2317         switch (type) {
2318         case VFIO_IOMMU_NOTIFY:
2319                 ret = vfio_register_iommu_notifier(group, events, nb);
2320                 break;
2321         case VFIO_GROUP_NOTIFY:
2322                 ret = vfio_register_group_notifier(group, events, nb);
2323                 break;
2324         default:
2325                 ret = -EINVAL;
2326         }
2327
2328         vfio_group_put(group);
2329         return ret;
2330 }
2331 EXPORT_SYMBOL(vfio_register_notifier);
2332
2333 int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2334                              struct notifier_block *nb)
2335 {
2336         struct vfio_group *group;
2337         int ret;
2338
2339         if (!dev || !nb)
2340                 return -EINVAL;
2341
2342         group = vfio_group_get_from_dev(dev);
2343         if (!group)
2344                 return -ENODEV;
2345
2346         switch (type) {
2347         case VFIO_IOMMU_NOTIFY:
2348                 ret = vfio_unregister_iommu_notifier(group, nb);
2349                 break;
2350         case VFIO_GROUP_NOTIFY:
2351                 ret = vfio_unregister_group_notifier(group, nb);
2352                 break;
2353         default:
2354                 ret = -EINVAL;
2355         }
2356
2357         vfio_group_put(group);
2358         return ret;
2359 }
2360 EXPORT_SYMBOL(vfio_unregister_notifier);
2361
2362 struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group)
2363 {
2364         struct vfio_container *container;
2365         struct vfio_iommu_driver *driver;
2366
2367         if (!group)
2368                 return ERR_PTR(-EINVAL);
2369
2370         container = group->container;
2371         driver = container->iommu_driver;
2372         if (likely(driver && driver->ops->group_iommu_domain))
2373                 return driver->ops->group_iommu_domain(container->iommu_data,
2374                                                        group->iommu_group);
2375
2376         return ERR_PTR(-ENOTTY);
2377 }
2378 EXPORT_SYMBOL_GPL(vfio_group_iommu_domain);
2379
2380 /**
2381  * Module/class support
2382  */
2383 static char *vfio_devnode(struct device *dev, umode_t *mode)
2384 {
2385         return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2386 }
2387
2388 static struct miscdevice vfio_dev = {
2389         .minor = VFIO_MINOR,
2390         .name = "vfio",
2391         .fops = &vfio_fops,
2392         .nodename = "vfio/vfio",
2393         .mode = S_IRUGO | S_IWUGO,
2394 };
2395
2396 static int __init vfio_init(void)
2397 {
2398         int ret;
2399
2400         idr_init(&vfio.group_idr);
2401         mutex_init(&vfio.group_lock);
2402         mutex_init(&vfio.iommu_drivers_lock);
2403         INIT_LIST_HEAD(&vfio.group_list);
2404         INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2405
2406         ret = misc_register(&vfio_dev);
2407         if (ret) {
2408                 pr_err("vfio: misc device register failed\n");
2409                 return ret;
2410         }
2411
2412         /* /dev/vfio/$GROUP */
2413         vfio.class = class_create(THIS_MODULE, "vfio");
2414         if (IS_ERR(vfio.class)) {
2415                 ret = PTR_ERR(vfio.class);
2416                 goto err_class;
2417         }
2418
2419         vfio.class->devnode = vfio_devnode;
2420
2421         ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
2422         if (ret)
2423                 goto err_alloc_chrdev;
2424
2425         cdev_init(&vfio.group_cdev, &vfio_group_fops);
2426         ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK + 1);
2427         if (ret)
2428                 goto err_cdev_add;
2429
2430         pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2431
2432 #ifdef CONFIG_VFIO_NOIOMMU
2433         vfio_register_iommu_driver(&vfio_noiommu_ops);
2434 #endif
2435         return 0;
2436
2437 err_cdev_add:
2438         unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2439 err_alloc_chrdev:
2440         class_destroy(vfio.class);
2441         vfio.class = NULL;
2442 err_class:
2443         misc_deregister(&vfio_dev);
2444         return ret;
2445 }
2446
2447 static void __exit vfio_cleanup(void)
2448 {
2449         WARN_ON(!list_empty(&vfio.group_list));
2450
2451 #ifdef CONFIG_VFIO_NOIOMMU
2452         vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2453 #endif
2454         idr_destroy(&vfio.group_idr);
2455         cdev_del(&vfio.group_cdev);
2456         unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2457         class_destroy(vfio.class);
2458         vfio.class = NULL;
2459         misc_deregister(&vfio_dev);
2460         xa_destroy(&vfio_device_set_xa);
2461 }
2462
2463 module_init(vfio_init);
2464 module_exit(vfio_cleanup);
2465
2466 MODULE_VERSION(DRIVER_VERSION);
2467 MODULE_LICENSE("GPL v2");
2468 MODULE_AUTHOR(DRIVER_AUTHOR);
2469 MODULE_DESCRIPTION(DRIVER_DESC);
2470 MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2471 MODULE_ALIAS("devname:vfio/vfio");
2472 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");