Merge tag 'vfio-v5.4-rc1' of git://github.com/awilliam/linux-vfio
author     Linus Torvalds <torvalds@linux-foundation.org>
           Fri, 20 Sep 2019 22:06:13 +0000 (15:06 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Fri, 20 Sep 2019 22:06:13 +0000 (15:06 -0700)
Pull VFIO updates from Alex Williamson:

 - Fix spapr iommu error case (Alexey Kardashevskiy)

 - Consolidate region type definitions (Cornelia Huck)

 - Restore saved original PCI state on release (hexin)

 - Simplify mtty sample driver interrupt path (Parav Pandit)

 - Support for reporting valid IOVA regions to user (Shameer Kolothum);
   see the userspace query sketch after the shortlog below

* tag 'vfio-v5.4-rc1' of git://github.com/awilliam/linux-vfio:
  vfio_pci: Restore original state on release
  vfio/type1: remove duplicate retrieval of reserved regions
  vfio/type1: Add IOVA range capability support
  vfio/type1: check dma map request is within a valid iova range
  vfio/spapr_tce: Fix incorrect tce_iommu_group memory free
  vfio-mdev/mtty: Simplify interrupt generation
  vfio: re-arrange vfio region definitions
  vfio/type1: Update iova list on detach
  vfio/type1: Check reserved region conflict and update iova list
  vfio/type1: Introduce iova list and add iommu aperture validity check

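The IOVA range reporting is exposed to userspace through a new VFIO_IOMMU_GET_INFO
capability chain entry (VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE), built in
vfio_iommu_iova_build_caps() in the diff below. The following is a minimal userspace
sketch of how a caller might walk that chain; it assumes the UAPI structures added by
this series are available from <linux/vfio.h>, that container_fd is an open VFIO
container with a group attached and a type1 IOMMU set, and the helper name is made up
for illustration. Error handling is abbreviated.

/*
 * Hypothetical helper: dump the valid IOVA ranges reported by the type1
 * IOMMU driver via VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE.  container_fd
 * must already have VFIO_TYPE1_IOMMU (or the v2 variant) enabled.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int dump_iova_ranges(int container_fd)
{
	struct vfio_iommu_type1_info *info;
	struct vfio_info_cap_header *hdr;
	size_t argsz = sizeof(*info);

	/* First pass with the bare struct just to learn the required argsz. */
	info = calloc(1, argsz);
	if (!info)
		return -1;
	info->argsz = argsz;
	if (ioctl(container_fd, VFIO_IOMMU_GET_INFO, info))
		goto err;

	if (info->argsz > argsz) {
		struct vfio_iommu_type1_info *bigger;

		argsz = info->argsz;
		bigger = realloc(info, argsz);
		if (!bigger)
			goto err;
		info = bigger;
		memset(info, 0, argsz);
		info->argsz = argsz;
		if (ioctl(container_fd, VFIO_IOMMU_GET_INFO, info))
			goto err;
	}

	if (!(info->flags & VFIO_IOMMU_INFO_CAPS) || !info->cap_offset) {
		/* Old kernel, or a container with no IOMMU-backed group. */
		printf("no IOVA range capability reported\n");
		free(info);
		return 0;
	}

	/* Capability offsets are relative to the start of the info struct. */
	hdr = (struct vfio_info_cap_header *)((char *)info + info->cap_offset);
	for (;;) {
		if (hdr->id == VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
			struct vfio_iommu_type1_info_cap_iova_range *cap =
				(struct vfio_iommu_type1_info_cap_iova_range *)hdr;
			__u32 i;

			for (i = 0; i < cap->nr_iovas; i++)
				printf("iova range %u: 0x%llx - 0x%llx\n", i,
				       (unsigned long long)cap->iova_ranges[i].start,
				       (unsigned long long)cap->iova_ranges[i].end);
		}
		if (!hdr->next)
			break;
		hdr = (struct vfio_info_cap_header *)((char *)info + hdr->next);
	}

	free(info);
	return 0;

err:
	free(info);
	return -1;
}

A kernel without this series leaves VFIO_IOMMU_INFO_CAPS clear, so the sketch degrades
gracefully; a container holding only an mdev device reports no ranges because its
iova_list is empty, matching the kernel-side comment in vfio_iommu_iova_build_caps().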
drivers/vfio/vfio_iommu_spapr_tce.c
drivers/vfio/vfio_iommu_type1.c

@@@ -435,7 -435,7 +435,7 @@@ static int tce_iommu_clear(struct tce_c
        unsigned long oldhpa;
        long ret;
        enum dma_data_direction direction;
 -      unsigned long lastentry = entry + pages;
 +      unsigned long lastentry = entry + pages, firstentry = entry;
  
        for ( ; entry < lastentry; ++entry) {
                if (tbl->it_indirect_levels && tbl->it_userspace) {
  
                direction = DMA_NONE;
                oldhpa = 0;
 -              ret = iommu_tce_xchg(container->mm, tbl, entry, &oldhpa,
 +              ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry, &oldhpa,
                                &direction);
                if (ret)
                        continue;
                tce_iommu_unuse_page(container, oldhpa);
        }
  
 +      iommu_tce_kill(tbl, firstentry, pages);
 +
        return 0;
  }
  
@@@ -520,8 -518,8 +520,8 @@@ static long tce_iommu_build(struct tce_
  
                hpa |= offset;
                dirtmp = direction;
 -              ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa,
 -                              &dirtmp);
 +              ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
 +                              &hpa, &dirtmp);
                if (ret) {
                        tce_iommu_unuse_page(container, hpa);
                        pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
  
        if (ret)
                tce_iommu_clear(container, tbl, entry, i);
 +      else
 +              iommu_tce_kill(tbl, entry, pages);
  
        return ret;
  }
@@@ -576,8 -572,8 +576,8 @@@ static long tce_iommu_build_v2(struct t
                if (mm_iommu_mapped_inc(mem))
                        break;
  
 -              ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa,
 -                              &dirtmp);
 +              ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
 +                              &hpa, &dirtmp);
                if (ret) {
                        /* dirtmp cannot be DMA_NONE here */
                        tce_iommu_unuse_page_v2(container, tbl, entry + i);
  
        if (ret)
                tce_iommu_clear(container, tbl, entry, i);
 +      else
 +              iommu_tce_kill(tbl, entry, pages);
  
        return ret;
  }
@@@ -1240,7 -1234,7 +1240,7 @@@ release_exit
  static int tce_iommu_attach_group(void *iommu_data,
                struct iommu_group *iommu_group)
  {
-       int ret;
+       int ret = 0;
        struct tce_container *container = iommu_data;
        struct iommu_table_group *table_group;
        struct tce_iommu_group *tcegrp = NULL;
                        !table_group->ops->release_ownership) {
                if (container->v2) {
                        ret = -EPERM;
-                       goto unlock_exit;
+                       goto free_exit;
                }
                ret = tce_iommu_take_ownership(container, table_group);
        } else {
                if (!container->v2) {
                        ret = -EPERM;
-                       goto unlock_exit;
+                       goto free_exit;
                }
                ret = tce_iommu_take_ownership_ddw(container, table_group);
                if (!tce_groups_attached(container) && !container->tables[0])
                list_add(&tcegrp->next, &container->group_list);
        }
  
unlock_exit:
free_exit:
        if (ret && tcegrp)
                kfree(tcegrp);
  
+ unlock_exit:
        mutex_unlock(&container->lock);
  
        return ret;
@@@ -62,6 -62,7 +62,7 @@@ MODULE_PARM_DESC(dma_entry_limit
  
  struct vfio_iommu {
        struct list_head        domain_list;
+       struct list_head        iova_list;
        struct vfio_domain      *external_domain; /* domain for external user */
        struct mutex            lock;
        struct rb_root          dma_list;
@@@ -97,6 -98,12 +98,12 @@@ struct vfio_group 
        bool                    mdev_group;     /* An mdev group */
  };
  
+ struct vfio_iova {
+       struct list_head        list;
+       dma_addr_t              start;
+       dma_addr_t              end;
+ };
  /*
   * Guest RAM pinning working set or DMA target
   */
@@@ -650,13 -657,12 +657,13 @@@ unpin_exit
  }
  
  static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
 -                              struct list_head *regions)
 +                          struct list_head *regions,
 +                          struct iommu_iotlb_gather *iotlb_gather)
  {
        long unlocked = 0;
        struct vfio_regions *entry, *next;
  
 -      iommu_tlb_sync(domain->domain);
 +      iommu_tlb_sync(domain->domain, iotlb_gather);
  
        list_for_each_entry_safe(entry, next, regions, list) {
                unlocked += vfio_unpin_pages_remote(dma,
@@@ -686,19 -692,18 +693,19 @@@ static size_t unmap_unpin_fast(struct v
                               struct vfio_dma *dma, dma_addr_t *iova,
                               size_t len, phys_addr_t phys, long *unlocked,
                               struct list_head *unmapped_list,
 -                             int *unmapped_cnt)
 +                             int *unmapped_cnt,
 +                             struct iommu_iotlb_gather *iotlb_gather)
  {
        size_t unmapped = 0;
        struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
  
        if (entry) {
 -              unmapped = iommu_unmap_fast(domain->domain, *iova, len);
 +              unmapped = iommu_unmap_fast(domain->domain, *iova, len,
 +                                          iotlb_gather);
  
                if (!unmapped) {
                        kfree(entry);
                } else {
 -                      iommu_tlb_range_add(domain->domain, *iova, unmapped);
                        entry->iova = *iova;
                        entry->phys = phys;
                        entry->len  = unmapped;
         * or in case of errors.
         */
        if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
 -              *unlocked += vfio_sync_unpin(dma, domain,
 -                                           unmapped_list);
 +              *unlocked += vfio_sync_unpin(dma, domain, unmapped_list,
 +                                           iotlb_gather);
                *unmapped_cnt = 0;
        }
  
@@@ -746,7 -751,6 +753,7 @@@ static long vfio_unmap_unpin(struct vfi
        dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
        struct vfio_domain *domain, *d;
        LIST_HEAD(unmapped_region_list);
 +      struct iommu_iotlb_gather iotlb_gather;
        int unmapped_region_cnt = 0;
        long unlocked = 0;
  
                cond_resched();
        }
  
 +      iommu_iotlb_gather_init(&iotlb_gather);
        while (iova < end) {
                size_t unmapped, len;
                phys_addr_t phys, next;
                 */
                unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
                                            &unlocked, &unmapped_region_list,
 -                                          &unmapped_region_cnt);
 +                                          &unmapped_region_cnt,
 +                                          &iotlb_gather);
                if (!unmapped) {
                        unmapped = unmap_unpin_slow(domain, dma, &iova, len,
                                                    phys, &unlocked);
  
        dma->iommu_mapped = false;
  
 -      if (unmapped_region_cnt)
 -              unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list);
 +      if (unmapped_region_cnt) {
 +              unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list,
 +                                          &iotlb_gather);
 +      }
  
        if (do_accounting) {
                vfio_lock_acct(dma, -unlocked, true);
@@@ -1038,6 -1038,27 +1045,27 @@@ static int vfio_pin_map_dma(struct vfio
        return ret;
  }
  
+ /*
+  * Check dma map request is within a valid iova range
+  */
+ static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
+                                     dma_addr_t start, dma_addr_t end)
+ {
+       struct list_head *iova = &iommu->iova_list;
+       struct vfio_iova *node;
+       list_for_each_entry(node, iova, list) {
+               if (start >= node->start && end <= node->end)
+                       return true;
+       }
+       /*
+        * Check for list_empty() as well since a container with
+        * a single mdev device will have an empty list.
+        */
+       return list_empty(iova);
+ }
  static int vfio_dma_do_map(struct vfio_iommu *iommu,
                           struct vfio_iommu_type1_dma_map *map)
  {
                goto out_unlock;
        }
  
+       if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
+               ret = -EINVAL;
+               goto out_unlock;
+       }
        dma = kzalloc(sizeof(*dma), GFP_KERNEL);
        if (!dma) {
                ret = -ENOMEM;
@@@ -1270,15 -1296,13 +1303,13 @@@ static struct vfio_group *find_iommu_gr
        return NULL;
  }
  
- static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)
+ static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
+                                 phys_addr_t *base)
  {
-       struct list_head group_resv_regions;
-       struct iommu_resv_region *region, *next;
+       struct iommu_resv_region *region;
        bool ret = false;
  
-       INIT_LIST_HEAD(&group_resv_regions);
-       iommu_get_group_resv_regions(group, &group_resv_regions);
-       list_for_each_entry(region, &group_resv_regions, list) {
+       list_for_each_entry(region, group_resv_regions, list) {
                /*
                 * The presence of any 'real' MSI regions should take
                 * precedence over the software-managed one if the
                        ret = true;
                }
        }
-       list_for_each_entry_safe(region, next, &group_resv_regions, list)
-               kfree(region);
        return ret;
  }
  
@@@ -1395,6 -1418,228 +1425,228 @@@ static int vfio_mdev_iommu_device(struc
        return 0;
  }
  
+ /*
+  * This is a helper function to insert an address range to iova list.
+  * The list is initially created with a single entry corresponding to
+  * the IOMMU domain geometry to which the device group is attached.
+  * The list aperture gets modified when a new domain is added to the
+  * container if the new aperture doesn't conflict with the current one
+  * or with any existing dma mappings. The list is also modified to
+  * exclude any reserved regions associated with the device group.
+  */
+ static int vfio_iommu_iova_insert(struct list_head *head,
+                                 dma_addr_t start, dma_addr_t end)
+ {
+       struct vfio_iova *region;
+       region = kmalloc(sizeof(*region), GFP_KERNEL);
+       if (!region)
+               return -ENOMEM;
+       INIT_LIST_HEAD(&region->list);
+       region->start = start;
+       region->end = end;
+       list_add_tail(&region->list, head);
+       return 0;
+ }
+ /*
+  * Check the new iommu aperture conflicts with existing aper or with any
+  * existing dma mappings.
+  */
+ static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu,
+                                    dma_addr_t start, dma_addr_t end)
+ {
+       struct vfio_iova *first, *last;
+       struct list_head *iova = &iommu->iova_list;
+       if (list_empty(iova))
+               return false;
+       /* Disjoint sets, return conflict */
+       first = list_first_entry(iova, struct vfio_iova, list);
+       last = list_last_entry(iova, struct vfio_iova, list);
+       if (start > last->end || end < first->start)
+               return true;
+       /* Check for any existing dma mappings below the new start */
+       if (start > first->start) {
+               if (vfio_find_dma(iommu, first->start, start - first->start))
+                       return true;
+       }
+       /* Check for any existing dma mappings beyond the new end */
+       if (end < last->end) {
+               if (vfio_find_dma(iommu, end + 1, last->end - end))
+                       return true;
+       }
+       return false;
+ }
+ /*
+  * Resize iommu iova aperture window. This is called only if the new
+  * aperture has no conflict with existing aperture and dma mappings.
+  */
+ static int vfio_iommu_aper_resize(struct list_head *iova,
+                                 dma_addr_t start, dma_addr_t end)
+ {
+       struct vfio_iova *node, *next;
+       if (list_empty(iova))
+               return vfio_iommu_iova_insert(iova, start, end);
+       /* Adjust iova list start */
+       list_for_each_entry_safe(node, next, iova, list) {
+               if (start < node->start)
+                       break;
+               if (start >= node->start && start < node->end) {
+                       node->start = start;
+                       break;
+               }
+               /* Delete nodes before new start */
+               list_del(&node->list);
+               kfree(node);
+       }
+       /* Adjust iova list end */
+       list_for_each_entry_safe(node, next, iova, list) {
+               if (end > node->end)
+                       continue;
+               if (end > node->start && end <= node->end) {
+                       node->end = end;
+                       continue;
+               }
+               /* Delete nodes after new end */
+               list_del(&node->list);
+               kfree(node);
+       }
+       return 0;
+ }
+ /*
+  * Check reserved region conflicts with existing dma mappings
+  */
+ static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
+                                    struct list_head *resv_regions)
+ {
+       struct iommu_resv_region *region;
+       /* Check for conflict with existing dma mappings */
+       list_for_each_entry(region, resv_regions, list) {
+               if (region->type == IOMMU_RESV_DIRECT_RELAXABLE)
+                       continue;
+               if (vfio_find_dma(iommu, region->start, region->length))
+                       return true;
+       }
+       return false;
+ }
+ /*
+  * Check iova region overlap with reserved regions and
+  * exclude them from the iommu iova range
+  */
+ static int vfio_iommu_resv_exclude(struct list_head *iova,
+                                  struct list_head *resv_regions)
+ {
+       struct iommu_resv_region *resv;
+       struct vfio_iova *n, *next;
+       list_for_each_entry(resv, resv_regions, list) {
+               phys_addr_t start, end;
+               if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
+                       continue;
+               start = resv->start;
+               end = resv->start + resv->length - 1;
+               list_for_each_entry_safe(n, next, iova, list) {
+                       int ret = 0;
+                       /* No overlap */
+                       if (start > n->end || end < n->start)
+                               continue;
+                       /*
+                        * Insert a new node if current node overlaps with the
+                        * reserve region to exclude that from valid iova range.
+                        * Note that, new node is inserted before the current
+                        * node and finally the current node is deleted keeping
+                        * the list updated and sorted.
+                        */
+                       if (start > n->start)
+                               ret = vfio_iommu_iova_insert(&n->list, n->start,
+                                                            start - 1);
+                       if (!ret && end < n->end)
+                               ret = vfio_iommu_iova_insert(&n->list, end + 1,
+                                                            n->end);
+                       if (ret)
+                               return ret;
+                       list_del(&n->list);
+                       kfree(n);
+               }
+       }
+       if (list_empty(iova))
+               return -EINVAL;
+       return 0;
+ }
+ static void vfio_iommu_resv_free(struct list_head *resv_regions)
+ {
+       struct iommu_resv_region *n, *next;
+       list_for_each_entry_safe(n, next, resv_regions, list) {
+               list_del(&n->list);
+               kfree(n);
+       }
+ }
+ static void vfio_iommu_iova_free(struct list_head *iova)
+ {
+       struct vfio_iova *n, *next;
+       list_for_each_entry_safe(n, next, iova, list) {
+               list_del(&n->list);
+               kfree(n);
+       }
+ }
+ static int vfio_iommu_iova_get_copy(struct vfio_iommu *iommu,
+                                   struct list_head *iova_copy)
+ {
+       struct list_head *iova = &iommu->iova_list;
+       struct vfio_iova *n;
+       int ret;
+       list_for_each_entry(n, iova, list) {
+               ret = vfio_iommu_iova_insert(iova_copy, n->start, n->end);
+               if (ret)
+                       goto out_free;
+       }
+       return 0;
+ out_free:
+       vfio_iommu_iova_free(iova_copy);
+       return ret;
+ }
+ static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
+                                       struct list_head *iova_copy)
+ {
+       struct list_head *iova = &iommu->iova_list;
+       vfio_iommu_iova_free(iova);
+       list_splice_tail(iova_copy, iova);
+ }
  static int vfio_iommu_type1_attach_group(void *iommu_data,
                                         struct iommu_group *iommu_group)
  {
        int ret;
        bool resv_msi, msi_remap;
        phys_addr_t resv_msi_base;
+       struct iommu_domain_geometry geo;
+       LIST_HEAD(iova_copy);
+       LIST_HEAD(group_resv_regions);
  
        mutex_lock(&iommu->lock);
  
        if (ret)
                goto out_domain;
  
-       resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
+       /* Get aperture info */
+       iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo);
+       if (vfio_iommu_aper_conflict(iommu, geo.aperture_start,
+                                    geo.aperture_end)) {
+               ret = -EINVAL;
+               goto out_detach;
+       }
+       ret = iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
+       if (ret)
+               goto out_detach;
+       if (vfio_iommu_resv_conflict(iommu, &group_resv_regions)) {
+               ret = -EINVAL;
+               goto out_detach;
+       }
+       /*
+        * We don't want to work on the original iova list as the list
+        * gets modified and in case of failure we have to retain the
+        * original list. Get a copy here.
+        */
+       ret = vfio_iommu_iova_get_copy(iommu, &iova_copy);
+       if (ret)
+               goto out_detach;
+       ret = vfio_iommu_aper_resize(&iova_copy, geo.aperture_start,
+                                    geo.aperture_end);
+       if (ret)
+               goto out_detach;
+       ret = vfio_iommu_resv_exclude(&iova_copy, &group_resv_regions);
+       if (ret)
+               goto out_detach;
+       resv_msi = vfio_iommu_has_sw_msi(&group_resv_regions, &resv_msi_base);
  
        INIT_LIST_HEAD(&domain->group_list);
        list_add(&group->next, &domain->group_list);
                                list_add(&group->next, &d->group_list);
                                iommu_domain_free(domain->domain);
                                kfree(domain);
-                               mutex_unlock(&iommu->lock);
-                               return 0;
+                               goto done;
                        }
  
                        ret = vfio_iommu_attach_group(domain, group);
        }
  
        list_add(&domain->next, &iommu->domain_list);
+ done:
+       /* Delete the old one and insert new iova list */
+       vfio_iommu_iova_insert_copy(iommu, &iova_copy);
        mutex_unlock(&iommu->lock);
+       vfio_iommu_resv_free(&group_resv_regions);
  
        return 0;
  
@@@ -1547,6 -1833,8 +1840,8 @@@ out_detach
        vfio_iommu_detach_group(domain, group);
  out_domain:
        iommu_domain_free(domain->domain);
+       vfio_iommu_iova_free(&iova_copy);
+       vfio_iommu_resv_free(&group_resv_regions);
  out_free:
        kfree(domain);
        kfree(group);
@@@ -1602,12 -1890,93 +1897,93 @@@ static void vfio_sanity_check_pfn_list(
        WARN_ON(iommu->notifier.head);
  }
  
+ /*
+  * Called when a domain is removed in detach. It is possible that
+  * the removed domain decided the iova aperture window. Modify the
+  * iova aperture with the smallest window among existing domains.
+  */
+ static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
+                                  struct list_head *iova_copy)
+ {
+       struct vfio_domain *domain;
+       struct iommu_domain_geometry geo;
+       struct vfio_iova *node;
+       dma_addr_t start = 0;
+       dma_addr_t end = (dma_addr_t)~0;
+       if (list_empty(iova_copy))
+               return;
+       list_for_each_entry(domain, &iommu->domain_list, next) {
+               iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY,
+                                     &geo);
+               if (geo.aperture_start > start)
+                       start = geo.aperture_start;
+               if (geo.aperture_end < end)
+                       end = geo.aperture_end;
+       }
+       /* Modify aperture limits. The new aper is either same or bigger */
+       node = list_first_entry(iova_copy, struct vfio_iova, list);
+       node->start = start;
+       node = list_last_entry(iova_copy, struct vfio_iova, list);
+       node->end = end;
+ }
+ /*
+  * Called when a group is detached. The reserved regions for that
+  * group can be part of valid iova now. But since reserved regions
+  * may be duplicated among groups, populate the iova valid regions
+  * list again.
+  */
+ static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu,
+                                  struct list_head *iova_copy)
+ {
+       struct vfio_domain *d;
+       struct vfio_group *g;
+       struct vfio_iova *node;
+       dma_addr_t start, end;
+       LIST_HEAD(resv_regions);
+       int ret;
+       if (list_empty(iova_copy))
+               return -EINVAL;
+       list_for_each_entry(d, &iommu->domain_list, next) {
+               list_for_each_entry(g, &d->group_list, next) {
+                       ret = iommu_get_group_resv_regions(g->iommu_group,
+                                                          &resv_regions);
+                       if (ret)
+                               goto done;
+               }
+       }
+       node = list_first_entry(iova_copy, struct vfio_iova, list);
+       start = node->start;
+       node = list_last_entry(iova_copy, struct vfio_iova, list);
+       end = node->end;
+       /* purge the iova list and create new one */
+       vfio_iommu_iova_free(iova_copy);
+       ret = vfio_iommu_aper_resize(iova_copy, start, end);
+       if (ret)
+               goto done;
+       /* Exclude current reserved regions from iova ranges */
+       ret = vfio_iommu_resv_exclude(iova_copy, &resv_regions);
+ done:
+       vfio_iommu_resv_free(&resv_regions);
+       return ret;
+ }
  static void vfio_iommu_type1_detach_group(void *iommu_data,
                                          struct iommu_group *iommu_group)
  {
        struct vfio_iommu *iommu = iommu_data;
        struct vfio_domain *domain;
        struct vfio_group *group;
+       LIST_HEAD(iova_copy);
  
        mutex_lock(&iommu->lock);
  
                }
        }
  
+       /*
+        * Get a copy of iova list. This will be used to update
+        * and to replace the current one later. Please note that
+        * we will leave the original list as it is if update fails.
+        */
+       vfio_iommu_iova_get_copy(iommu, &iova_copy);
        list_for_each_entry(domain, &iommu->domain_list, next) {
                group = find_iommu_group(domain, iommu_group);
                if (!group)
                        iommu_domain_free(domain->domain);
                        list_del(&domain->next);
                        kfree(domain);
+                       vfio_iommu_aper_expand(iommu, &iova_copy);
                }
                break;
        }
  
+       if (!vfio_iommu_resv_refresh(iommu, &iova_copy))
+               vfio_iommu_iova_insert_copy(iommu, &iova_copy);
+       else
+               vfio_iommu_iova_free(&iova_copy);
  detach_group_done:
        mutex_unlock(&iommu->lock);
  }
@@@ -1686,6 -2068,7 +2075,7 @@@ static void *vfio_iommu_type1_open(unsi
        }
  
        INIT_LIST_HEAD(&iommu->domain_list);
+       INIT_LIST_HEAD(&iommu->iova_list);
        iommu->dma_list = RB_ROOT;
        iommu->dma_avail = dma_entry_limit;
        mutex_init(&iommu->lock);
@@@ -1729,6 -2112,9 +2119,9 @@@ static void vfio_iommu_type1_release(vo
                list_del(&domain->next);
                kfree(domain);
        }
+       vfio_iommu_iova_free(&iommu->iova_list);
        kfree(iommu);
  }
  
@@@ -1749,6 -2135,73 +2142,73 @@@ static int vfio_domains_have_iommu_cach
        return ret;
  }
  
+ static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
+                struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
+                size_t size)
+ {
+       struct vfio_info_cap_header *header;
+       struct vfio_iommu_type1_info_cap_iova_range *iova_cap;
+       header = vfio_info_cap_add(caps, size,
+                                  VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
+       if (IS_ERR(header))
+               return PTR_ERR(header);
+       iova_cap = container_of(header,
+                               struct vfio_iommu_type1_info_cap_iova_range,
+                               header);
+       iova_cap->nr_iovas = cap_iovas->nr_iovas;
+       memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
+              cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));
+       return 0;
+ }
+ static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
+                                     struct vfio_info_cap *caps)
+ {
+       struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
+       struct vfio_iova *iova;
+       size_t size;
+       int iovas = 0, i = 0, ret;
+       mutex_lock(&iommu->lock);
+       list_for_each_entry(iova, &iommu->iova_list, list)
+               iovas++;
+       if (!iovas) {
+               /*
+                * Return 0 as a container with a single mdev device
+                * will have an empty list
+                */
+               ret = 0;
+               goto out_unlock;
+       }
+       size = sizeof(*cap_iovas) + (iovas * sizeof(*cap_iovas->iova_ranges));
+       cap_iovas = kzalloc(size, GFP_KERNEL);
+       if (!cap_iovas) {
+               ret = -ENOMEM;
+               goto out_unlock;
+       }
+       cap_iovas->nr_iovas = iovas;
+       list_for_each_entry(iova, &iommu->iova_list, list) {
+               cap_iovas->iova_ranges[i].start = iova->start;
+               cap_iovas->iova_ranges[i].end = iova->end;
+               i++;
+       }
+       ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);
+       kfree(cap_iovas);
+ out_unlock:
+       mutex_unlock(&iommu->lock);
+       return ret;
+ }
  static long vfio_iommu_type1_ioctl(void *iommu_data,
                                   unsigned int cmd, unsigned long arg)
  {
                }
        } else if (cmd == VFIO_IOMMU_GET_INFO) {
                struct vfio_iommu_type1_info info;
+               struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
+               unsigned long capsz;
+               int ret;
  
                minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
  
+               /* For backward compatibility, cannot require this */
+               capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
                if (copy_from_user(&info, (void __user *)arg, minsz))
                        return -EFAULT;
  
                if (info.argsz < minsz)
                        return -EINVAL;
  
+               if (info.argsz >= capsz) {
+                       minsz = capsz;
+                       info.cap_offset = 0; /* output, no-recopy necessary */
+               }
                info.flags = VFIO_IOMMU_INFO_PGSIZES;
  
                info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
  
+               ret = vfio_iommu_iova_build_caps(iommu, &caps);
+               if (ret)
+                       return ret;
+               if (caps.size) {
+                       info.flags |= VFIO_IOMMU_INFO_CAPS;
+                       if (info.argsz < sizeof(info) + caps.size) {
+                               info.argsz = sizeof(info) + caps.size;
+                       } else {
+                               vfio_info_cap_shift(&caps, sizeof(info));
+                               if (copy_to_user((void __user *)arg +
+                                               sizeof(info), caps.buf,
+                                               caps.size)) {
+                                       kfree(caps.buf);
+                                       return -EFAULT;
+                               }
+                               info.cap_offset = sizeof(info);
+                       }
+                       kfree(caps.buf);
+               }
                return copy_to_user((void __user *)arg, &info, minsz) ?
                        -EFAULT : 0;