iommu: Make __iommu_group_set_domain() handle error unwind
author Jason Gunthorpe <jgg@nvidia.com>
Thu, 11 May 2023 04:42:01 +0000 (01:42 -0300)
committer Joerg Roedel <jroedel@suse.de>
Tue, 23 May 2023 06:15:51 +0000 (08:15 +0200)
Let's try to have a consistent and clear strategy for error handling
during domain attach failures.

There are two broad categories, the first is callers doing destruction and
trying to set the domain back to a previously good domain. These cases
cannot handle failure during destruction flows and must succeed, or at
least avoid a UAF on the current group->domain which is likely about to be
freed.

Many of the drivers are well behaved here and will not hit the WARN_ON's
or a UAF, but some are doing hypercalls/etc that can fail unpredictably
and don't meet the expectations.

The second case is attaching a domain for the first time in a failable
context; on failure the attachment should be restored back to group->domain
using the above unfailable operation.

Have __iommu_group_set_domain_internal() execute a common algorithm that
tries to achieve this, and in the worst case, would leave a device
"detached" or assigned to a global blocking domain. This relies on some
existing common driver behaviors where attach failure will also do detach
and true IOMMU_DOMAIN_BLOCK implementations that are not allowed to ever
fail.

Name the first case with __iommu_group_set_domain_nofail() to make it
clear.

Pull all the error handling and WARN_ON generation into
__iommu_group_set_domain_internal().

Avoid the obfuscating use of __iommu_group_for_each_dev() and be more
careful about what should happen during failures by only touching devices
we've already touched.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Tested-by: Heiko Stuebner <heiko@sntech.de>
Tested-by: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/3-v5-1b99ae392328+44574-iommu_err_unwind_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
drivers/iommu/iommu.c

index e806f4c..74cb162 100644 (file)
@@ -101,8 +101,26 @@ static int __iommu_attach_device(struct iommu_domain *domain,
                                 struct device *dev);
 static int __iommu_attach_group(struct iommu_domain *domain,
                                struct iommu_group *group);
+
+enum {
+       IOMMU_SET_DOMAIN_MUST_SUCCEED = 1 << 0,
+};
+
+static int __iommu_group_set_domain_internal(struct iommu_group *group,
+                                            struct iommu_domain *new_domain,
+                                            unsigned int flags);
 static int __iommu_group_set_domain(struct iommu_group *group,
-                                   struct iommu_domain *new_domain);
+                                   struct iommu_domain *new_domain)
+{
+       return __iommu_group_set_domain_internal(group, new_domain, 0);
+}
+static void __iommu_group_set_domain_nofail(struct iommu_group *group,
+                                           struct iommu_domain *new_domain)
+{
+       WARN_ON(__iommu_group_set_domain_internal(
+               group, new_domain, IOMMU_SET_DOMAIN_MUST_SUCCEED));
+}
+
 static int iommu_create_device_direct_mappings(struct iommu_group *group,
                                               struct device *dev);
 static struct iommu_group *iommu_group_get_for_dev(struct device *dev);
@@ -2022,15 +2040,13 @@ EXPORT_SYMBOL_GPL(iommu_domain_free);
 static void __iommu_group_set_core_domain(struct iommu_group *group)
 {
        struct iommu_domain *new_domain;
-       int ret;
 
        if (group->owner)
                new_domain = group->blocking_domain;
        else
                new_domain = group->default_domain;
 
-       ret = __iommu_group_set_domain(group, new_domain);
-       WARN(ret, "iommu driver failed to attach the default/blocking domain");
+       __iommu_group_set_domain_nofail(group, new_domain);
 }
 
 static int __iommu_attach_device(struct iommu_domain *domain,
@@ -2215,21 +2231,55 @@ int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group)
 }
 EXPORT_SYMBOL_GPL(iommu_attach_group);
 
-static int iommu_group_do_set_platform_dma(struct device *dev, void *data)
+static int __iommu_device_set_domain(struct iommu_group *group,
+                                    struct device *dev,
+                                    struct iommu_domain *new_domain,
+                                    unsigned int flags)
 {
-       const struct iommu_ops *ops = dev_iommu_ops(dev);
-
-       if (!WARN_ON(!ops->set_platform_dma_ops))
-               ops->set_platform_dma_ops(dev);
+       int ret;
 
+       ret = __iommu_attach_device(new_domain, dev);
+       if (ret) {
+               /*
+                * If we have a blocking domain then try to attach that in hopes
+                * of avoiding a UAF. Modern drivers should implement blocking
+                * domains as global statics that cannot fail.
+                */
+               if ((flags & IOMMU_SET_DOMAIN_MUST_SUCCEED) &&
+                   group->blocking_domain &&
+                   group->blocking_domain != new_domain)
+                       __iommu_attach_device(group->blocking_domain, dev);
+               return ret;
+       }
        return 0;
 }
 
-static int __iommu_group_set_domain(struct iommu_group *group,
-                                   struct iommu_domain *new_domain)
+/*
+ * If 0 is returned the group's domain is new_domain. If an error is returned
+ * then the group's domain will be set back to the existing domain, unless
+ * IOMMU_SET_DOMAIN_MUST_SUCCEED is set, in which case the error is returned
+ * and the group's domain is left inconsistent. It is a driver bug to fail attach
+ * with a previously good domain. We try to avoid a kernel UAF because of this.
+ *
+ * IOMMU groups are really the natural working unit of the IOMMU, but the IOMMU
+ * API works on domains and devices.  Bridge that gap by iterating over the
+ * devices in a group.  Ideally we'd have a single device which represents the
+ * requestor ID of the group, but we also allow IOMMU drivers to create policy
+ * defined minimum sets, where the physical hardware may be able to distinguish
+ * members, but we wish to group them at a higher level (ex. untrusted
+ * multi-function PCI devices).  Thus we attach each device.
+ */
+static int __iommu_group_set_domain_internal(struct iommu_group *group,
+                                            struct iommu_domain *new_domain,
+                                            unsigned int flags)
 {
+       struct group_device *last_gdev;
+       struct group_device *gdev;
+       int result;
        int ret;
 
+       lockdep_assert_held(&group->mutex);
+
        if (group->domain == new_domain)
                return 0;
 
@@ -2239,8 +2289,12 @@ static int __iommu_group_set_domain(struct iommu_group *group,
         * platform specific behavior.
         */
        if (!new_domain) {
-               __iommu_group_for_each_dev(group, NULL,
-                                          iommu_group_do_set_platform_dma);
+               for_each_group_device(group, gdev) {
+                       const struct iommu_ops *ops = dev_iommu_ops(gdev->dev);
+
+                       if (!WARN_ON(!ops->set_platform_dma_ops))
+                               ops->set_platform_dma_ops(gdev->dev);
+               }
                group->domain = NULL;
                return 0;
        }
@@ -2250,16 +2304,52 @@ static int __iommu_group_set_domain(struct iommu_group *group,
         * domain. This switch does not have to be atomic and DMA can be
         * discarded during the transition. DMA must only be able to access
         * either new_domain or group->domain, never something else.
-        *
-        * Note that this is called in error unwind paths, attaching to a
-        * domain that has already been attached cannot fail.
         */
-       ret = __iommu_group_for_each_dev(group, new_domain,
-                                        iommu_group_do_attach_device);
-       if (ret)
-               return ret;
+       result = 0;
+       for_each_group_device(group, gdev) {
+               ret = __iommu_device_set_domain(group, gdev->dev, new_domain,
+                                               flags);
+               if (ret) {
+                       result = ret;
+                       /*
+                        * Keep trying the other devices in the group. If a
+                        * driver fails attach to an otherwise good domain, and
+                        * does not support blocking domains, it should at least
+                        * drop its reference on the current domain so we don't
+                        * UAF.
+                        */
+                       if (flags & IOMMU_SET_DOMAIN_MUST_SUCCEED)
+                               continue;
+                       goto err_revert;
+               }
+       }
        group->domain = new_domain;
-       return 0;
+       return result;
+
+err_revert:
+       /*
+        * This is called in error unwind paths. A well behaved driver should
+        * always allow us to attach to a domain that was already attached.
+        */
+       last_gdev = gdev;
+       for_each_group_device(group, gdev) {
+               const struct iommu_ops *ops = dev_iommu_ops(gdev->dev);
+
+               /*
+                * If set_platform_dma_ops is not present a NULL domain can
+                * happen only for first probe, in which case we leave
+                * group->domain as NULL and let release clean everything up.
+                */
+               if (group->domain)
+                       WARN_ON(__iommu_device_set_domain(
+                               group, gdev->dev, group->domain,
+                               IOMMU_SET_DOMAIN_MUST_SUCCEED));
+               else if (ops->set_platform_dma_ops)
+                       ops->set_platform_dma_ops(gdev->dev);
+               if (gdev == last_gdev)
+                       break;
+       }
+       return ret;
 }
 
 void iommu_detach_group(struct iommu_domain *domain, struct iommu_group *group)
@@ -3176,16 +3266,13 @@ EXPORT_SYMBOL_GPL(iommu_device_claim_dma_owner);
 
 static void __iommu_release_dma_ownership(struct iommu_group *group)
 {
-       int ret;
-
        if (WARN_ON(!group->owner_cnt || !group->owner ||
                    !xa_empty(&group->pasid_array)))
                return;
 
        group->owner_cnt = 0;
        group->owner = NULL;
-       ret = __iommu_group_set_domain(group, group->default_domain);
-       WARN(ret, "iommu driver failed to attach the default domain");
+       __iommu_group_set_domain_nofail(group, group->default_domain);
 }
 
 /**