Merge tag 'iommu-drivers-move-v5.8' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu
author     Linus Torvalds <torvalds@linux-foundation.org>
           Fri, 12 Jun 2020 19:19:13 +0000 (12:19 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Fri, 12 Jun 2020 19:19:13 +0000 (12:19 -0700)
Pull iommu driver directory structure cleanup from Joerg Roedel:
 "Move the Intel and AMD IOMMU drivers into their own subdirectories.

  Both drivers now consist of several files, and giving each driver its
  own directory unclutters the IOMMU top-level directory a bit"

* tag 'iommu-drivers-move-v5.8' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu:
  iommu/vt-d: Move Intel IOMMU driver into subdirectory
  iommu/amd: Move AMD IOMMU driver into subdirectory

MAINTAINERS
drivers/iommu/amd/iommu_v2.c
drivers/iommu/intel/svm.c

diff --cc MAINTAINERS
Simple merge
diff --cc drivers/iommu/amd/iommu_v2.c
index 0000000000000000000000000000000000000000,c8a7b6b392221485c60a061e6a674ef159329a60..e4b025c5637c45bd909c6250fe02eb5f12b48790
mode 000000,100644..100644
--- /dev/null
+++ b/drivers/iommu/amd/iommu_v2.c
@@@ -1,0 -1,981 +1,981 @@@
+ // SPDX-License-Identifier: GPL-2.0-only
+ /*
+  * Copyright (C) 2010-2012 Advanced Micro Devices, Inc.
+  * Author: Joerg Roedel <jroedel@suse.de>
+  */
+ #define pr_fmt(fmt)     "AMD-Vi: " fmt
+ #include <linux/mmu_notifier.h>
+ #include <linux/amd-iommu.h>
+ #include <linux/mm_types.h>
+ #include <linux/profile.h>
+ #include <linux/module.h>
+ #include <linux/sched.h>
+ #include <linux/sched/mm.h>
+ #include <linux/wait.h>
+ #include <linux/pci.h>
+ #include <linux/gfp.h>
+ #include "amd_iommu.h"
+ MODULE_LICENSE("GPL v2");
+ MODULE_AUTHOR("Joerg Roedel <jroedel@suse.de>");
+ #define MAX_DEVICES           0x10000
+ #define PRI_QUEUE_SIZE                512
+ struct pri_queue {
+       atomic_t inflight;
+       bool finish;
+       int status;
+ };
+ struct pasid_state {
+       struct list_head list;                  /* For global state-list */
+       atomic_t count;                         /* Reference count */
+       unsigned mmu_notifier_count;            /* Counting nested mmu_notifier
+                                                  calls */
+       struct mm_struct *mm;                   /* mm_struct for the faults */
+       struct mmu_notifier mn;                 /* mmu_notifier handle */
+       struct pri_queue pri[PRI_QUEUE_SIZE];   /* PRI tag states */
+       struct device_state *device_state;      /* Link to our device_state */
+       int pasid;                              /* PASID index */
+       bool invalid;                           /* Used during setup and
+                                                  teardown of the pasid */
+       spinlock_t lock;                        /* Protect pri_queues and
+                                                  mmu_notifier_count */
+       wait_queue_head_t wq;                   /* To wait for count == 0 */
+ };
+ struct device_state {
+       struct list_head list;
+       u16 devid;
+       atomic_t count;
+       struct pci_dev *pdev;
+       struct pasid_state **states;
+       struct iommu_domain *domain;
+       int pasid_levels;
+       int max_pasids;
+       amd_iommu_invalid_ppr_cb inv_ppr_cb;
+       amd_iommu_invalidate_ctx inv_ctx_cb;
+       spinlock_t lock;
+       wait_queue_head_t wq;
+ };
+ struct fault {
+       struct work_struct work;
+       struct device_state *dev_state;
+       struct pasid_state *state;
+       struct mm_struct *mm;
+       u64 address;
+       u16 devid;
+       u16 pasid;
+       u16 tag;
+       u16 finish;
+       u16 flags;
+ };
+ static LIST_HEAD(state_list);
+ static spinlock_t state_lock;
+ static struct workqueue_struct *iommu_wq;
+ static void free_pasid_states(struct device_state *dev_state);
+ static u16 device_id(struct pci_dev *pdev)
+ {
+       u16 devid;
+       devid = pdev->bus->number;
+       devid = (devid << 8) | pdev->devfn;
+       return devid;
+ }
+ static struct device_state *__get_device_state(u16 devid)
+ {
+       struct device_state *dev_state;
+       list_for_each_entry(dev_state, &state_list, list) {
+               if (dev_state->devid == devid)
+                       return dev_state;
+       }
+       return NULL;
+ }
+ static struct device_state *get_device_state(u16 devid)
+ {
+       struct device_state *dev_state;
+       unsigned long flags;
+       spin_lock_irqsave(&state_lock, flags);
+       dev_state = __get_device_state(devid);
+       if (dev_state != NULL)
+               atomic_inc(&dev_state->count);
+       spin_unlock_irqrestore(&state_lock, flags);
+       return dev_state;
+ }
+ static void free_device_state(struct device_state *dev_state)
+ {
+       struct iommu_group *group;
+       /*
+        * First detach device from domain - No more PRI requests will arrive
+        * from that device after it is unbound from the IOMMUv2 domain.
+        */
+       group = iommu_group_get(&dev_state->pdev->dev);
+       if (WARN_ON(!group))
+               return;
+       iommu_detach_group(dev_state->domain, group);
+       iommu_group_put(group);
+       /* Everything is down now, free the IOMMUv2 domain */
+       iommu_domain_free(dev_state->domain);
+       /* Finally get rid of the device-state */
+       kfree(dev_state);
+ }
+ static void put_device_state(struct device_state *dev_state)
+ {
+       if (atomic_dec_and_test(&dev_state->count))
+               wake_up(&dev_state->wq);
+ }
+ /* Must be called under dev_state->lock */
+ static struct pasid_state **__get_pasid_state_ptr(struct device_state *dev_state,
+                                                 int pasid, bool alloc)
+ {
+       struct pasid_state **root, **ptr;
+       int level, index;
+       level = dev_state->pasid_levels;
+       root  = dev_state->states;
+       while (true) {
+               index = (pasid >> (9 * level)) & 0x1ff;
+               ptr   = &root[index];
+               if (level == 0)
+                       break;
+               if (*ptr == NULL) {
+                       if (!alloc)
+                               return NULL;
+                       *ptr = (void *)get_zeroed_page(GFP_ATOMIC);
+                       if (*ptr == NULL)
+                               return NULL;
+               }
+               root   = (struct pasid_state **)*ptr;
+               level -= 1;
+       }
+       return ptr;
+ }
+ static int set_pasid_state(struct device_state *dev_state,
+                          struct pasid_state *pasid_state,
+                          int pasid)
+ {
+       struct pasid_state **ptr;
+       unsigned long flags;
+       int ret;
+       spin_lock_irqsave(&dev_state->lock, flags);
+       ptr = __get_pasid_state_ptr(dev_state, pasid, true);
+       ret = -ENOMEM;
+       if (ptr == NULL)
+               goto out_unlock;
+       ret = -ENOMEM;
+       if (*ptr != NULL)
+               goto out_unlock;
+       *ptr = pasid_state;
+       ret = 0;
+ out_unlock:
+       spin_unlock_irqrestore(&dev_state->lock, flags);
+       return ret;
+ }
+ static void clear_pasid_state(struct device_state *dev_state, int pasid)
+ {
+       struct pasid_state **ptr;
+       unsigned long flags;
+       spin_lock_irqsave(&dev_state->lock, flags);
+       ptr = __get_pasid_state_ptr(dev_state, pasid, true);
+       if (ptr == NULL)
+               goto out_unlock;
+       *ptr = NULL;
+ out_unlock:
+       spin_unlock_irqrestore(&dev_state->lock, flags);
+ }
+ static struct pasid_state *get_pasid_state(struct device_state *dev_state,
+                                          int pasid)
+ {
+       struct pasid_state **ptr, *ret = NULL;
+       unsigned long flags;
+       spin_lock_irqsave(&dev_state->lock, flags);
+       ptr = __get_pasid_state_ptr(dev_state, pasid, false);
+       if (ptr == NULL)
+               goto out_unlock;
+       ret = *ptr;
+       if (ret)
+               atomic_inc(&ret->count);
+ out_unlock:
+       spin_unlock_irqrestore(&dev_state->lock, flags);
+       return ret;
+ }
+ static void free_pasid_state(struct pasid_state *pasid_state)
+ {
+       kfree(pasid_state);
+ }
+ static void put_pasid_state(struct pasid_state *pasid_state)
+ {
+       if (atomic_dec_and_test(&pasid_state->count))
+               wake_up(&pasid_state->wq);
+ }
+ static void put_pasid_state_wait(struct pasid_state *pasid_state)
+ {
+       atomic_dec(&pasid_state->count);
+       wait_event(pasid_state->wq, !atomic_read(&pasid_state->count));
+       free_pasid_state(pasid_state);
+ }
+ static void unbind_pasid(struct pasid_state *pasid_state)
+ {
+       struct iommu_domain *domain;
+       domain = pasid_state->device_state->domain;
+       /*
+        * Mark pasid_state as invalid, no more faults will be added to the
+        * work queue after this is visible everywhere.
+        */
+       pasid_state->invalid = true;
+       /* Make sure this is visible */
+       smp_wmb();
+       /* After this the device/pasid can't access the mm anymore */
+       amd_iommu_domain_clear_gcr3(domain, pasid_state->pasid);
+       /* Make sure no more pending faults are in the queue */
+       flush_workqueue(iommu_wq);
+ }
+ static void free_pasid_states_level1(struct pasid_state **tbl)
+ {
+       int i;
+       for (i = 0; i < 512; ++i) {
+               if (tbl[i] == NULL)
+                       continue;
+               free_page((unsigned long)tbl[i]);
+       }
+ }
+ static void free_pasid_states_level2(struct pasid_state **tbl)
+ {
+       struct pasid_state **ptr;
+       int i;
+       for (i = 0; i < 512; ++i) {
+               if (tbl[i] == NULL)
+                       continue;
+               ptr = (struct pasid_state **)tbl[i];
+               free_pasid_states_level1(ptr);
+       }
+ }
+ static void free_pasid_states(struct device_state *dev_state)
+ {
+       struct pasid_state *pasid_state;
+       int i;
+       for (i = 0; i < dev_state->max_pasids; ++i) {
+               pasid_state = get_pasid_state(dev_state, i);
+               if (pasid_state == NULL)
+                       continue;
+               put_pasid_state(pasid_state);
+               /*
+                * This will call the mn_release function and
+                * unbind the PASID
+                */
+               mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm);
+               put_pasid_state_wait(pasid_state); /* Reference taken in
+                                                     amd_iommu_bind_pasid */
+               /* Drop reference taken in amd_iommu_bind_pasid */
+               put_device_state(dev_state);
+       }
+       if (dev_state->pasid_levels == 2)
+               free_pasid_states_level2(dev_state->states);
+       else if (dev_state->pasid_levels == 1)
+               free_pasid_states_level1(dev_state->states);
+       else
+               BUG_ON(dev_state->pasid_levels != 0);
+       free_page((unsigned long)dev_state->states);
+ }
+ static struct pasid_state *mn_to_state(struct mmu_notifier *mn)
+ {
+       return container_of(mn, struct pasid_state, mn);
+ }
+ static void mn_invalidate_range(struct mmu_notifier *mn,
+                               struct mm_struct *mm,
+                               unsigned long start, unsigned long end)
+ {
+       struct pasid_state *pasid_state;
+       struct device_state *dev_state;
+       pasid_state = mn_to_state(mn);
+       dev_state   = pasid_state->device_state;
+       if ((start ^ (end - 1)) < PAGE_SIZE)
+               amd_iommu_flush_page(dev_state->domain, pasid_state->pasid,
+                                    start);
+       else
+               amd_iommu_flush_tlb(dev_state->domain, pasid_state->pasid);
+ }
+ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
+ {
+       struct pasid_state *pasid_state;
+       struct device_state *dev_state;
+       bool run_inv_ctx_cb;
+       might_sleep();
+       pasid_state    = mn_to_state(mn);
+       dev_state      = pasid_state->device_state;
+       run_inv_ctx_cb = !pasid_state->invalid;
+       if (run_inv_ctx_cb && dev_state->inv_ctx_cb)
+               dev_state->inv_ctx_cb(dev_state->pdev, pasid_state->pasid);
+       unbind_pasid(pasid_state);
+ }
+ static const struct mmu_notifier_ops iommu_mn = {
+       .release                = mn_release,
+       .invalidate_range       = mn_invalidate_range,
+ };
+ static void set_pri_tag_status(struct pasid_state *pasid_state,
+                              u16 tag, int status)
+ {
+       unsigned long flags;
+       spin_lock_irqsave(&pasid_state->lock, flags);
+       pasid_state->pri[tag].status = status;
+       spin_unlock_irqrestore(&pasid_state->lock, flags);
+ }
+ static void finish_pri_tag(struct device_state *dev_state,
+                          struct pasid_state *pasid_state,
+                          u16 tag)
+ {
+       unsigned long flags;
+       spin_lock_irqsave(&pasid_state->lock, flags);
+       if (atomic_dec_and_test(&pasid_state->pri[tag].inflight) &&
+           pasid_state->pri[tag].finish) {
+               amd_iommu_complete_ppr(dev_state->pdev, pasid_state->pasid,
+                                      pasid_state->pri[tag].status, tag);
+               pasid_state->pri[tag].finish = false;
+               pasid_state->pri[tag].status = PPR_SUCCESS;
+       }
+       spin_unlock_irqrestore(&pasid_state->lock, flags);
+ }
+ static void handle_fault_error(struct fault *fault)
+ {
+       int status;
+       if (!fault->dev_state->inv_ppr_cb) {
+               set_pri_tag_status(fault->state, fault->tag, PPR_INVALID);
+               return;
+       }
+       status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev,
+                                             fault->pasid,
+                                             fault->address,
+                                             fault->flags);
+       switch (status) {
+       case AMD_IOMMU_INV_PRI_RSP_SUCCESS:
+               set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS);
+               break;
+       case AMD_IOMMU_INV_PRI_RSP_INVALID:
+               set_pri_tag_status(fault->state, fault->tag, PPR_INVALID);
+               break;
+       case AMD_IOMMU_INV_PRI_RSP_FAIL:
+               set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE);
+               break;
+       default:
+               BUG();
+       }
+ }
+ static bool access_error(struct vm_area_struct *vma, struct fault *fault)
+ {
+       unsigned long requested = 0;
+       if (fault->flags & PPR_FAULT_EXEC)
+               requested |= VM_EXEC;
+       if (fault->flags & PPR_FAULT_READ)
+               requested |= VM_READ;
+       if (fault->flags & PPR_FAULT_WRITE)
+               requested |= VM_WRITE;
+       return (requested & ~vma->vm_flags) != 0;
+ }
+ static void do_fault(struct work_struct *work)
+ {
+       struct fault *fault = container_of(work, struct fault, work);
+       struct vm_area_struct *vma;
+       vm_fault_t ret = VM_FAULT_ERROR;
+       unsigned int flags = 0;
+       struct mm_struct *mm;
+       u64 address;
+       mm = fault->state->mm;
+       address = fault->address;
+       if (fault->flags & PPR_FAULT_USER)
+               flags |= FAULT_FLAG_USER;
+       if (fault->flags & PPR_FAULT_WRITE)
+               flags |= FAULT_FLAG_WRITE;
+       flags |= FAULT_FLAG_REMOTE;
 -      down_read(&mm->mmap_sem);
++      mmap_read_lock(mm);
+       vma = find_extend_vma(mm, address);
+       if (!vma || address < vma->vm_start)
+               /* failed to get a vma in the right range */
+               goto out;
+       /* Check if we have the right permissions on the vma */
+       if (access_error(vma, fault))
+               goto out;
+       ret = handle_mm_fault(vma, address, flags);
+ out:
 -      up_read(&mm->mmap_sem);
++      mmap_read_unlock(mm);
+       if (ret & VM_FAULT_ERROR)
+               /* failed to service fault */
+               handle_fault_error(fault);
+       finish_pri_tag(fault->dev_state, fault->state, fault->tag);
+       put_pasid_state(fault->state);
+       kfree(fault);
+ }
+ static int ppr_notifier(struct notifier_block *nb, unsigned long e, void *data)
+ {
+       struct amd_iommu_fault *iommu_fault;
+       struct pasid_state *pasid_state;
+       struct device_state *dev_state;
+       struct pci_dev *pdev = NULL;
+       unsigned long flags;
+       struct fault *fault;
+       bool finish;
+       u16 tag, devid;
+       int ret;
+       iommu_fault = data;
+       tag         = iommu_fault->tag & 0x1ff;
+       finish      = (iommu_fault->tag >> 9) & 1;
+       devid = iommu_fault->device_id;
+       pdev = pci_get_domain_bus_and_slot(0, PCI_BUS_NUM(devid),
+                                          devid & 0xff);
+       if (!pdev)
+               return -ENODEV;
+       ret = NOTIFY_DONE;
+       /* In kdump kernel pci dev is not initialized yet -> send INVALID */
+       if (amd_iommu_is_attach_deferred(NULL, &pdev->dev)) {
+               amd_iommu_complete_ppr(pdev, iommu_fault->pasid,
+                                      PPR_INVALID, tag);
+               goto out;
+       }
+       dev_state = get_device_state(iommu_fault->device_id);
+       if (dev_state == NULL)
+               goto out;
+       pasid_state = get_pasid_state(dev_state, iommu_fault->pasid);
+       if (pasid_state == NULL || pasid_state->invalid) {
+               /* We know the device but not the PASID -> send INVALID */
+               amd_iommu_complete_ppr(dev_state->pdev, iommu_fault->pasid,
+                                      PPR_INVALID, tag);
+               goto out_drop_state;
+       }
+       spin_lock_irqsave(&pasid_state->lock, flags);
+       atomic_inc(&pasid_state->pri[tag].inflight);
+       if (finish)
+               pasid_state->pri[tag].finish = true;
+       spin_unlock_irqrestore(&pasid_state->lock, flags);
+       fault = kzalloc(sizeof(*fault), GFP_ATOMIC);
+       if (fault == NULL) {
+               /* We are OOM - send success and let the device re-fault */
+               finish_pri_tag(dev_state, pasid_state, tag);
+               goto out_drop_state;
+       }
+       fault->dev_state = dev_state;
+       fault->address   = iommu_fault->address;
+       fault->state     = pasid_state;
+       fault->tag       = tag;
+       fault->finish    = finish;
+       fault->pasid     = iommu_fault->pasid;
+       fault->flags     = iommu_fault->flags;
+       INIT_WORK(&fault->work, do_fault);
+       queue_work(iommu_wq, &fault->work);
+       ret = NOTIFY_OK;
+ out_drop_state:
+       if (ret != NOTIFY_OK && pasid_state)
+               put_pasid_state(pasid_state);
+       put_device_state(dev_state);
+ out:
+       return ret;
+ }
+ static struct notifier_block ppr_nb = {
+       .notifier_call = ppr_notifier,
+ };
+ int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid,
+                        struct task_struct *task)
+ {
+       struct pasid_state *pasid_state;
+       struct device_state *dev_state;
+       struct mm_struct *mm;
+       u16 devid;
+       int ret;
+       might_sleep();
+       if (!amd_iommu_v2_supported())
+               return -ENODEV;
+       devid     = device_id(pdev);
+       dev_state = get_device_state(devid);
+       if (dev_state == NULL)
+               return -EINVAL;
+       ret = -EINVAL;
+       if (pasid < 0 || pasid >= dev_state->max_pasids)
+               goto out;
+       ret = -ENOMEM;
+       pasid_state = kzalloc(sizeof(*pasid_state), GFP_KERNEL);
+       if (pasid_state == NULL)
+               goto out;
+       atomic_set(&pasid_state->count, 1);
+       init_waitqueue_head(&pasid_state->wq);
+       spin_lock_init(&pasid_state->lock);
+       mm                        = get_task_mm(task);
+       pasid_state->mm           = mm;
+       pasid_state->device_state = dev_state;
+       pasid_state->pasid        = pasid;
+       pasid_state->invalid      = true; /* Mark as valid only if we are
+                                            done with setting up the pasid */
+       pasid_state->mn.ops       = &iommu_mn;
+       if (pasid_state->mm == NULL)
+               goto out_free;
+       mmu_notifier_register(&pasid_state->mn, mm);
+       ret = set_pasid_state(dev_state, pasid_state, pasid);
+       if (ret)
+               goto out_unregister;
+       ret = amd_iommu_domain_set_gcr3(dev_state->domain, pasid,
+                                       __pa(pasid_state->mm->pgd));
+       if (ret)
+               goto out_clear_state;
+       /* Now we are ready to handle faults */
+       pasid_state->invalid = false;
+       /*
+        * Drop the reference to the mm_struct here. We rely on the
+        * mmu_notifier release call-back to inform us when the mm
+        * is going away.
+        */
+       mmput(mm);
+       return 0;
+ out_clear_state:
+       clear_pasid_state(dev_state, pasid);
+ out_unregister:
+       mmu_notifier_unregister(&pasid_state->mn, mm);
+       mmput(mm);
+ out_free:
+       free_pasid_state(pasid_state);
+ out:
+       put_device_state(dev_state);
+       return ret;
+ }
+ EXPORT_SYMBOL(amd_iommu_bind_pasid);
+ void amd_iommu_unbind_pasid(struct pci_dev *pdev, int pasid)
+ {
+       struct pasid_state *pasid_state;
+       struct device_state *dev_state;
+       u16 devid;
+       might_sleep();
+       if (!amd_iommu_v2_supported())
+               return;
+       devid = device_id(pdev);
+       dev_state = get_device_state(devid);
+       if (dev_state == NULL)
+               return;
+       if (pasid < 0 || pasid >= dev_state->max_pasids)
+               goto out;
+       pasid_state = get_pasid_state(dev_state, pasid);
+       if (pasid_state == NULL)
+               goto out;
+       /*
+        * Drop reference taken here. We are safe because we still hold
+        * the reference taken in the amd_iommu_bind_pasid function.
+        */
+       put_pasid_state(pasid_state);
+       /* Clear the pasid state so that the pasid can be re-used */
+       clear_pasid_state(dev_state, pasid_state->pasid);
+       /*
+        * Call mmu_notifier_unregister to drop our reference
+        * to pasid_state->mm
+        */
+       mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm);
+       put_pasid_state_wait(pasid_state); /* Reference taken in
+                                             amd_iommu_bind_pasid */
+ out:
+       /* Drop reference taken in this function */
+       put_device_state(dev_state);
+       /* Drop reference taken in amd_iommu_bind_pasid */
+       put_device_state(dev_state);
+ }
+ EXPORT_SYMBOL(amd_iommu_unbind_pasid);
+ int amd_iommu_init_device(struct pci_dev *pdev, int pasids)
+ {
+       struct device_state *dev_state;
+       struct iommu_group *group;
+       unsigned long flags;
+       int ret, tmp;
+       u16 devid;
+       might_sleep();
+       if (!amd_iommu_v2_supported())
+               return -ENODEV;
+       if (pasids <= 0 || pasids > (PASID_MASK + 1))
+               return -EINVAL;
+       devid = device_id(pdev);
+       dev_state = kzalloc(sizeof(*dev_state), GFP_KERNEL);
+       if (dev_state == NULL)
+               return -ENOMEM;
+       spin_lock_init(&dev_state->lock);
+       init_waitqueue_head(&dev_state->wq);
+       dev_state->pdev  = pdev;
+       dev_state->devid = devid;
+       tmp = pasids;
+       for (dev_state->pasid_levels = 0; (tmp - 1) & ~0x1ff; tmp >>= 9)
+               dev_state->pasid_levels += 1;
+       atomic_set(&dev_state->count, 1);
+       dev_state->max_pasids = pasids;
+       ret = -ENOMEM;
+       dev_state->states = (void *)get_zeroed_page(GFP_KERNEL);
+       if (dev_state->states == NULL)
+               goto out_free_dev_state;
+       dev_state->domain = iommu_domain_alloc(&pci_bus_type);
+       if (dev_state->domain == NULL)
+               goto out_free_states;
+       amd_iommu_domain_direct_map(dev_state->domain);
+       ret = amd_iommu_domain_enable_v2(dev_state->domain, pasids);
+       if (ret)
+               goto out_free_domain;
+       group = iommu_group_get(&pdev->dev);
+       if (!group) {
+               ret = -EINVAL;
+               goto out_free_domain;
+       }
+       ret = iommu_attach_group(dev_state->domain, group);
+       if (ret != 0)
+               goto out_drop_group;
+       iommu_group_put(group);
+       spin_lock_irqsave(&state_lock, flags);
+       if (__get_device_state(devid) != NULL) {
+               spin_unlock_irqrestore(&state_lock, flags);
+               ret = -EBUSY;
+               goto out_free_domain;
+       }
+       list_add_tail(&dev_state->list, &state_list);
+       spin_unlock_irqrestore(&state_lock, flags);
+       return 0;
+ out_drop_group:
+       iommu_group_put(group);
+ out_free_domain:
+       iommu_domain_free(dev_state->domain);
+ out_free_states:
+       free_page((unsigned long)dev_state->states);
+ out_free_dev_state:
+       kfree(dev_state);
+       return ret;
+ }
+ EXPORT_SYMBOL(amd_iommu_init_device);
+ void amd_iommu_free_device(struct pci_dev *pdev)
+ {
+       struct device_state *dev_state;
+       unsigned long flags;
+       u16 devid;
+       if (!amd_iommu_v2_supported())
+               return;
+       devid = device_id(pdev);
+       spin_lock_irqsave(&state_lock, flags);
+       dev_state = __get_device_state(devid);
+       if (dev_state == NULL) {
+               spin_unlock_irqrestore(&state_lock, flags);
+               return;
+       }
+       list_del(&dev_state->list);
+       spin_unlock_irqrestore(&state_lock, flags);
+       /* Get rid of any remaining pasid states */
+       free_pasid_states(dev_state);
+       put_device_state(dev_state);
+       /*
+        * Wait until the last reference is dropped before freeing
+        * the device state.
+        */
+       wait_event(dev_state->wq, !atomic_read(&dev_state->count));
+       free_device_state(dev_state);
+ }
+ EXPORT_SYMBOL(amd_iommu_free_device);
+ int amd_iommu_set_invalid_ppr_cb(struct pci_dev *pdev,
+                                amd_iommu_invalid_ppr_cb cb)
+ {
+       struct device_state *dev_state;
+       unsigned long flags;
+       u16 devid;
+       int ret;
+       if (!amd_iommu_v2_supported())
+               return -ENODEV;
+       devid = device_id(pdev);
+       spin_lock_irqsave(&state_lock, flags);
+       ret = -EINVAL;
+       dev_state = __get_device_state(devid);
+       if (dev_state == NULL)
+               goto out_unlock;
+       dev_state->inv_ppr_cb = cb;
+       ret = 0;
+ out_unlock:
+       spin_unlock_irqrestore(&state_lock, flags);
+       return ret;
+ }
+ EXPORT_SYMBOL(amd_iommu_set_invalid_ppr_cb);
+ int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev,
+                                   amd_iommu_invalidate_ctx cb)
+ {
+       struct device_state *dev_state;
+       unsigned long flags;
+       u16 devid;
+       int ret;
+       if (!amd_iommu_v2_supported())
+               return -ENODEV;
+       devid = device_id(pdev);
+       spin_lock_irqsave(&state_lock, flags);
+       ret = -EINVAL;
+       dev_state = __get_device_state(devid);
+       if (dev_state == NULL)
+               goto out_unlock;
+       dev_state->inv_ctx_cb = cb;
+       ret = 0;
+ out_unlock:
+       spin_unlock_irqrestore(&state_lock, flags);
+       return ret;
+ }
+ EXPORT_SYMBOL(amd_iommu_set_invalidate_ctx_cb);
+ static int __init amd_iommu_v2_init(void)
+ {
+       int ret;
+       pr_info("AMD IOMMUv2 driver by Joerg Roedel <jroedel@suse.de>\n");
+       if (!amd_iommu_v2_supported()) {
+               pr_info("AMD IOMMUv2 functionality not available on this system\n");
+               /*
+                * Load anyway to provide the symbols to other modules
+                * which may use AMD IOMMUv2 optionally.
+                */
+               return 0;
+       }
+       spin_lock_init(&state_lock);
+       ret = -ENOMEM;
+       iommu_wq = alloc_workqueue("amd_iommu_v2", WQ_MEM_RECLAIM, 0);
+       if (iommu_wq == NULL)
+               goto out;
+       amd_iommu_register_ppr_notifier(&ppr_nb);
+       return 0;
+ out:
+       return ret;
+ }
+ static void __exit amd_iommu_v2_exit(void)
+ {
+       struct device_state *dev_state;
+       int i;
+       if (!amd_iommu_v2_supported())
+               return;
+       amd_iommu_unregister_ppr_notifier(&ppr_nb);
+       flush_workqueue(iommu_wq);
+       /*
+        * The loop below might call flush_workqueue(), so call
+        * destroy_workqueue() after it
+        */
+       for (i = 0; i < MAX_DEVICES; ++i) {
+               dev_state = get_device_state(i);
+               if (dev_state == NULL)
+                       continue;
+               WARN_ON_ONCE(1);
+               put_device_state(dev_state);
+               amd_iommu_free_device(dev_state->pdev);
+       }
+       destroy_workqueue(iommu_wq);
+ }
+ module_init(amd_iommu_v2_init);
+ module_exit(amd_iommu_v2_exit);
diff --cc drivers/iommu/intel/svm.c
index 0000000000000000000000000000000000000000,a035ef911fba789dd206cbadd1cbdac96cf3d217..6c87c807a0abb8e3527a996d3d9ba88fd52456fe
mode 000000,100644..100644
--- /dev/null
+++ b/drivers/iommu/intel/svm.c
@@@ -1,0 -1,1002 +1,1002 @@@
 -              down_read(&svm->mm->mmap_sem);
+ // SPDX-License-Identifier: GPL-2.0-only
+ /*
+  * Copyright © 2015 Intel Corporation.
+  *
+  * Authors: David Woodhouse <dwmw2@infradead.org>
+  */
+ #include <linux/intel-iommu.h>
+ #include <linux/mmu_notifier.h>
+ #include <linux/sched.h>
+ #include <linux/sched/mm.h>
+ #include <linux/slab.h>
+ #include <linux/intel-svm.h>
+ #include <linux/rculist.h>
+ #include <linux/pci.h>
+ #include <linux/pci-ats.h>
+ #include <linux/dmar.h>
+ #include <linux/interrupt.h>
+ #include <linux/mm_types.h>
+ #include <linux/ioasid.h>
+ #include <asm/page.h>
+ #include "intel-pasid.h"
+ static irqreturn_t prq_event_thread(int irq, void *d);
+ static void intel_svm_drain_prq(struct device *dev, int pasid);
+ #define PRQ_ORDER 0
+ int intel_svm_enable_prq(struct intel_iommu *iommu)
+ {
+       struct page *pages;
+       int irq, ret;
+       pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
+       if (!pages) {
+               pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
+                       iommu->name);
+               return -ENOMEM;
+       }
+       iommu->prq = page_address(pages);
+       irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
+       if (irq <= 0) {
+               pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
+                      iommu->name);
+               ret = -EINVAL;
+       err:
+               free_pages((unsigned long)iommu->prq, PRQ_ORDER);
+               iommu->prq = NULL;
+               return ret;
+       }
+       iommu->pr_irq = irq;
+       snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);
+       ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
+                                  iommu->prq_name, iommu);
+       if (ret) {
+               pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
+                      iommu->name);
+               dmar_free_hwirq(irq);
+               iommu->pr_irq = 0;
+               goto err;
+       }
+       dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
+       dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
+       dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);
+       init_completion(&iommu->prq_complete);
+       return 0;
+ }
+ int intel_svm_finish_prq(struct intel_iommu *iommu)
+ {
+       dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
+       dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
+       dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);
+       if (iommu->pr_irq) {
+               free_irq(iommu->pr_irq, iommu);
+               dmar_free_hwirq(iommu->pr_irq);
+               iommu->pr_irq = 0;
+       }
+       free_pages((unsigned long)iommu->prq, PRQ_ORDER);
+       iommu->prq = NULL;
+       return 0;
+ }
+ static inline bool intel_svm_capable(struct intel_iommu *iommu)
+ {
+       return iommu->flags & VTD_FLAG_SVM_CAPABLE;
+ }
+ void intel_svm_check(struct intel_iommu *iommu)
+ {
+       if (!pasid_supported(iommu))
+               return;
+       if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
+           !cap_fl1gp_support(iommu->cap)) {
+               pr_err("%s SVM disabled, incompatible 1GB page capability\n",
+                      iommu->name);
+               return;
+       }
+       if (cpu_feature_enabled(X86_FEATURE_LA57) &&
+           !cap_5lp_support(iommu->cap)) {
+               pr_err("%s SVM disabled, incompatible paging mode\n",
+                      iommu->name);
+               return;
+       }
+       iommu->flags |= VTD_FLAG_SVM_CAPABLE;
+ }
+ static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_dev *sdev,
+                               unsigned long address, unsigned long pages, int ih)
+ {
+       struct qi_desc desc;
+       if (pages == -1) {
+               desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
+                       QI_EIOTLB_DID(sdev->did) |
+                       QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
+                       QI_EIOTLB_TYPE;
+               desc.qw1 = 0;
+       } else {
+               int mask = ilog2(__roundup_pow_of_two(pages));
+               desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
+                               QI_EIOTLB_DID(sdev->did) |
+                               QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) |
+                               QI_EIOTLB_TYPE;
+               desc.qw1 = QI_EIOTLB_ADDR(address) |
+                               QI_EIOTLB_IH(ih) |
+                               QI_EIOTLB_AM(mask);
+       }
+       desc.qw2 = 0;
+       desc.qw3 = 0;
+       qi_submit_sync(svm->iommu, &desc, 1, 0);
+       if (sdev->dev_iotlb) {
+               desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) |
+                               QI_DEV_EIOTLB_SID(sdev->sid) |
+                               QI_DEV_EIOTLB_QDEP(sdev->qdep) |
+                               QI_DEIOTLB_TYPE;
+               if (pages == -1) {
+                       desc.qw1 = QI_DEV_EIOTLB_ADDR(-1ULL >> 1) |
+                                       QI_DEV_EIOTLB_SIZE;
+               } else if (pages > 1) {
+                       /* The least significant zero bit indicates the size. So,
+                        * for example, an "address" value of 0x12345f000 will
+                        * flush from 0x123440000 to 0x12347ffff (256KiB). */
+                       unsigned long last = address + ((unsigned long)(pages - 1) << VTD_PAGE_SHIFT);
+                       unsigned long mask = __rounddown_pow_of_two(address ^ last);
+                       desc.qw1 = QI_DEV_EIOTLB_ADDR((address & ~mask) |
+                                       (mask - 1)) | QI_DEV_EIOTLB_SIZE;
+               } else {
+                       desc.qw1 = QI_DEV_EIOTLB_ADDR(address);
+               }
+               desc.qw2 = 0;
+               desc.qw3 = 0;
+               qi_submit_sync(svm->iommu, &desc, 1, 0);
+       }
+ }
+ static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
+                               unsigned long pages, int ih)
+ {
+       struct intel_svm_dev *sdev;
+       rcu_read_lock();
+       list_for_each_entry_rcu(sdev, &svm->devs, list)
+               intel_flush_svm_range_dev(svm, sdev, address, pages, ih);
+       rcu_read_unlock();
+ }
+ /* Pages have been freed at this point */
+ static void intel_invalidate_range(struct mmu_notifier *mn,
+                                  struct mm_struct *mm,
+                                  unsigned long start, unsigned long end)
+ {
+       struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
+       intel_flush_svm_range(svm, start,
+                             (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0);
+ }
+ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
+ {
+       struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
+       struct intel_svm_dev *sdev;
+       /* This might end up being called from exit_mmap(), *before* the page
+        * tables are cleared. And __mmu_notifier_release() will delete us from
+        * the list of notifiers so that our invalidate_range() callback doesn't
+        * get called when the page tables are cleared. So we need to protect
+        * against hardware accessing those page tables.
+        *
+        * We do it by clearing the entry in the PASID table and then flushing
+        * the IOTLB and the PASID table caches. This might upset hardware;
+        * perhaps we'll want to point the PASID to a dummy PGD (like the zero
+        * page) so that we end up taking a fault that the hardware really
+        * *has* to handle gracefully without affecting other processes.
+        */
+       rcu_read_lock();
+       list_for_each_entry_rcu(sdev, &svm->devs, list)
+               intel_pasid_tear_down_entry(svm->iommu, sdev->dev,
+                                           svm->pasid, true);
+       rcu_read_unlock();
+ }
+ static const struct mmu_notifier_ops intel_mmuops = {
+       .release = intel_mm_release,
+       .invalidate_range = intel_invalidate_range,
+ };
+ static DEFINE_MUTEX(pasid_mutex);
+ static LIST_HEAD(global_svm_list);
+ #define for_each_svm_dev(sdev, svm, d)                        \
+       list_for_each_entry((sdev), &(svm)->devs, list) \
+               if ((d) != (sdev)->dev) {} else
+ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
+                         struct iommu_gpasid_bind_data *data)
+ {
+       struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
+       struct dmar_domain *dmar_domain;
+       struct intel_svm_dev *sdev;
+       struct intel_svm *svm;
+       int ret = 0;
+       if (WARN_ON(!iommu) || !data)
+               return -EINVAL;
+       if (data->version != IOMMU_GPASID_BIND_VERSION_1 ||
+           data->format != IOMMU_PASID_FORMAT_INTEL_VTD)
+               return -EINVAL;
+       if (!dev_is_pci(dev))
+               return -ENOTSUPP;
+       /* VT-d supports devices with full 20 bit PASIDs only */
+       if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)
+               return -EINVAL;
+       /*
+        * We only check host PASID range, we have no knowledge to check
+        * guest PASID range.
+        */
+       if (data->hpasid <= 0 || data->hpasid >= PASID_MAX)
+               return -EINVAL;
+       dmar_domain = to_dmar_domain(domain);
+       mutex_lock(&pasid_mutex);
+       svm = ioasid_find(NULL, data->hpasid, NULL);
+       if (IS_ERR(svm)) {
+               ret = PTR_ERR(svm);
+               goto out;
+       }
+       if (svm) {
+               /*
+                * If we found svm for the PASID, there must be at
+                * least one device bond, otherwise svm should be freed.
+                */
+               if (WARN_ON(list_empty(&svm->devs))) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+               for_each_svm_dev(sdev, svm, dev) {
+                       /*
+                        * For devices with aux domains, we should allow
+                        * multiple bind calls with the same PASID and pdev.
+                        */
+                       if (iommu_dev_feature_enabled(dev,
+                                                     IOMMU_DEV_FEAT_AUX)) {
+                               sdev->users++;
+                       } else {
+                               dev_warn_ratelimited(dev,
+                                                    "Already bound with PASID %u\n",
+                                                    svm->pasid);
+                               ret = -EBUSY;
+                       }
+                       goto out;
+               }
+       } else {
+               /* We come here when PASID has never been bound to a device. */
+               svm = kzalloc(sizeof(*svm), GFP_KERNEL);
+               if (!svm) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+               /* REVISIT: upper layer/VFIO can track host process that bind
+                * the PASID. ioasid_set = mm might be sufficient for vfio to
+                * check pasid VMM ownership. We can drop the following line
+                * once VFIO and IOASID set check is in place.
+                */
+               svm->mm = get_task_mm(current);
+               svm->pasid = data->hpasid;
+               if (data->flags & IOMMU_SVA_GPASID_VAL) {
+                       svm->gpasid = data->gpasid;
+                       svm->flags |= SVM_FLAG_GUEST_PASID;
+               }
+               ioasid_set_data(data->hpasid, svm);
+               INIT_LIST_HEAD_RCU(&svm->devs);
+               mmput(svm->mm);
+       }
+       sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
+       if (!sdev) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       sdev->dev = dev;
+       /* Only count users if device has aux domains */
+       if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
+               sdev->users = 1;
+       /* Set up device context entry for PASID if not enabled already */
+       ret = intel_iommu_enable_pasid(iommu, sdev->dev);
+       if (ret) {
+               dev_err_ratelimited(dev, "Failed to enable PASID capability\n");
+               kfree(sdev);
+               goto out;
+       }
+       /*
+        * PASID table is per device for better security. Therefore, for
+        * each bind of a new device even with an existing PASID, we need to
+        * call the nested mode setup function here.
+        */
+       spin_lock(&iommu->lock);
+       ret = intel_pasid_setup_nested(iommu, dev,
+                                      (pgd_t *)(uintptr_t)data->gpgd,
+                                      data->hpasid, &data->vtd, dmar_domain,
+                                      data->addr_width);
+       spin_unlock(&iommu->lock);
+       if (ret) {
+               dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n",
+                                   data->hpasid, ret);
+               /*
+                * PASID entry should be in cleared state if nested mode
+                * set up failed. So we only need to clear IOASID tracking
+                * data such that free call will succeed.
+                */
+               kfree(sdev);
+               goto out;
+       }
+       svm->flags |= SVM_FLAG_GUEST_MODE;
+       init_rcu_head(&sdev->rcu);
+       list_add_rcu(&sdev->list, &svm->devs);
+  out:
+       if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
+               ioasid_set_data(data->hpasid, NULL);
+               kfree(svm);
+       }
+       mutex_unlock(&pasid_mutex);
+       return ret;
+ }
+ int intel_svm_unbind_gpasid(struct device *dev, int pasid)
+ {
+       struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
+       struct intel_svm_dev *sdev;
+       struct intel_svm *svm;
+       int ret = -EINVAL;
+       if (WARN_ON(!iommu))
+               return -EINVAL;
+       mutex_lock(&pasid_mutex);
+       svm = ioasid_find(NULL, pasid, NULL);
+       if (!svm) {
+               ret = -EINVAL;
+               goto out;
+       }
+       if (IS_ERR(svm)) {
+               ret = PTR_ERR(svm);
+               goto out;
+       }
+       for_each_svm_dev(sdev, svm, dev) {
+               ret = 0;
+               if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
+                       sdev->users--;
+               if (!sdev->users) {
+                       list_del_rcu(&sdev->list);
+                       intel_pasid_tear_down_entry(iommu, dev,
+                                                   svm->pasid, false);
+                       intel_svm_drain_prq(dev, svm->pasid);
+                       kfree_rcu(sdev, rcu);
+                       if (list_empty(&svm->devs)) {
+                               /*
+                                * We do not free the IOASID here in that
+                                * IOMMU driver did not allocate it.
+                                * Unlike native SVM, IOASID for guest use was
+                                * allocated prior to the bind call.
+                                * In any case, if the free call comes before
+                                * the unbind, IOMMU driver will get notified
+                                * and perform cleanup.
+                                */
+                               ioasid_set_data(pasid, NULL);
+                               kfree(svm);
+                       }
+               }
+               break;
+       }
+ out:
+       mutex_unlock(&pasid_mutex);
+       return ret;
+ }
+ /* Caller must hold pasid_mutex, mm reference */
+ static int
+ intel_svm_bind_mm(struct device *dev, int flags, struct svm_dev_ops *ops,
+                 struct mm_struct *mm, struct intel_svm_dev **sd)
+ {
+       struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
+       struct device_domain_info *info;
+       struct intel_svm_dev *sdev;
+       struct intel_svm *svm = NULL;
+       int pasid_max;
+       int ret;
+       if (!iommu || dmar_disabled)
+               return -EINVAL;
+       if (!intel_svm_capable(iommu))
+               return -ENOTSUPP;
+       if (dev_is_pci(dev)) {
+               pasid_max = pci_max_pasids(to_pci_dev(dev));
+               if (pasid_max < 0)
+                       return -EINVAL;
+       } else
+               pasid_max = 1 << 20;
+       /* Bind supervisor PASID should have mm = NULL */
+       if (flags & SVM_FLAG_SUPERVISOR_MODE) {
+               if (!ecap_srs(iommu->ecap) || mm) {
+                       pr_err("Supervisor PASID with user provided mm.\n");
+                       return -EINVAL;
+               }
+       }
+       if (!(flags & SVM_FLAG_PRIVATE_PASID)) {
+               struct intel_svm *t;
+               list_for_each_entry(t, &global_svm_list, list) {
+                       if (t->mm != mm || (t->flags & SVM_FLAG_PRIVATE_PASID))
+                               continue;
+                       svm = t;
+                       if (svm->pasid >= pasid_max) {
+                               dev_warn(dev,
+                                        "Limited PASID width. Cannot use existing PASID %d\n",
+                                        svm->pasid);
+                               ret = -ENOSPC;
+                               goto out;
+                       }
+                       /* Find the matching device in svm list */
+                       for_each_svm_dev(sdev, svm, dev) {
+                               if (sdev->ops != ops) {
+                                       ret = -EBUSY;
+                                       goto out;
+                               }
+                               sdev->users++;
+                               goto success;
+                       }
+                       break;
+               }
+       }
+       sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
+       if (!sdev) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       sdev->dev = dev;
+       ret = intel_iommu_enable_pasid(iommu, dev);
+       if (ret) {
+               kfree(sdev);
+               goto out;
+       }
+       info = get_domain_info(dev);
+       sdev->did = FLPT_DEFAULT_DID;
+       sdev->sid = PCI_DEVID(info->bus, info->devfn);
+       if (info->ats_enabled) {
+               sdev->dev_iotlb = 1;
+               sdev->qdep = info->ats_qdep;
+               if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
+                       sdev->qdep = 0;
+       }
+       /* Finish the setup now we know we're keeping it */
+       sdev->users = 1;
+       sdev->ops = ops;
+       init_rcu_head(&sdev->rcu);
+       if (!svm) {
+               svm = kzalloc(sizeof(*svm), GFP_KERNEL);
+               if (!svm) {
+                       ret = -ENOMEM;
+                       kfree(sdev);
+                       goto out;
+               }
+               svm->iommu = iommu;
+               if (pasid_max > intel_pasid_max_id)
+                       pasid_max = intel_pasid_max_id;
+               /* Do not use PASID 0, reserved for RID to PASID */
+               svm->pasid = ioasid_alloc(NULL, PASID_MIN,
+                                         pasid_max - 1, svm);
+               if (svm->pasid == INVALID_IOASID) {
+                       kfree(svm);
+                       kfree(sdev);
+                       ret = -ENOSPC;
+                       goto out;
+               }
+               svm->notifier.ops = &intel_mmuops;
+               svm->mm = mm;
+               svm->flags = flags;
+               INIT_LIST_HEAD_RCU(&svm->devs);
+               INIT_LIST_HEAD(&svm->list);
+               ret = -ENOMEM;
+               if (mm) {
+                       ret = mmu_notifier_register(&svm->notifier, mm);
+                       if (ret) {
+                               ioasid_free(svm->pasid);
+                               kfree(svm);
+                               kfree(sdev);
+                               goto out;
+                       }
+               }
+               spin_lock(&iommu->lock);
+               ret = intel_pasid_setup_first_level(iommu, dev,
+                               mm ? mm->pgd : init_mm.pgd,
+                               svm->pasid, FLPT_DEFAULT_DID,
+                               (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
+                               (cpu_feature_enabled(X86_FEATURE_LA57) ?
+                                PASID_FLAG_FL5LP : 0));
+               spin_unlock(&iommu->lock);
+               if (ret) {
+                       if (mm)
+                               mmu_notifier_unregister(&svm->notifier, mm);
+                       ioasid_free(svm->pasid);
+                       kfree(svm);
+                       kfree(sdev);
+                       goto out;
+               }
+               list_add_tail(&svm->list, &global_svm_list);
+       } else {
+               /*
+                * Binding a new device with existing PASID, need to setup
+                * the PASID entry.
+                */
+               spin_lock(&iommu->lock);
+               ret = intel_pasid_setup_first_level(iommu, dev,
+                                               mm ? mm->pgd : init_mm.pgd,
+                                               svm->pasid, FLPT_DEFAULT_DID,
+                                               (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
+                                               (cpu_feature_enabled(X86_FEATURE_LA57) ?
+                                               PASID_FLAG_FL5LP : 0));
+               spin_unlock(&iommu->lock);
+               if (ret) {
+                       kfree(sdev);
+                       goto out;
+               }
+       }
+       list_add_rcu(&sdev->list, &svm->devs);
+ success:
+       sdev->pasid = svm->pasid;
+       sdev->sva.dev = dev;
+       if (sd)
+               *sd = sdev;
+       ret = 0;
+  out:
+       return ret;
+ }
+ /* Caller must hold pasid_mutex */
+ static int intel_svm_unbind_mm(struct device *dev, int pasid)
+ {
+       struct intel_svm_dev *sdev;
+       struct intel_iommu *iommu;
+       struct intel_svm *svm;
+       int ret = -EINVAL;
+       iommu = intel_svm_device_to_iommu(dev);
+       if (!iommu)
+               goto out;
+       svm = ioasid_find(NULL, pasid, NULL);
+       if (!svm)
+               goto out;
+       if (IS_ERR(svm)) {
+               ret = PTR_ERR(svm);
+               goto out;
+       }
+       for_each_svm_dev(sdev, svm, dev) {
+               ret = 0;
+               sdev->users--;
+               if (!sdev->users) {
+                       list_del_rcu(&sdev->list);
+                       /* Flush the PASID cache and IOTLB for this device.
+                        * Note that we do depend on the hardware *not* using
+                        * the PASID any more. Just as we depend on other
+                        * devices never using PASIDs that they have no right
+                        * to use. We have a *shared* PASID table, because it's
+                        * large and has to be physically contiguous. So it's
+                        * hard to be as defensive as we might like. */
+                       intel_pasid_tear_down_entry(iommu, dev,
+                                                   svm->pasid, false);
+                       intel_svm_drain_prq(dev, svm->pasid);
+                       kfree_rcu(sdev, rcu);
+                       if (list_empty(&svm->devs)) {
+                               ioasid_free(svm->pasid);
+                               if (svm->mm)
+                                       mmu_notifier_unregister(&svm->notifier, svm->mm);
+                               list_del(&svm->list);
+                               /* We mandate that no page faults may be outstanding
+                                * for the PASID when intel_svm_unbind_mm() is called.
+                                * If that is not obeyed, subtle errors will happen.
+                                * Let's make them less subtle... */
+                               memset(svm, 0x6b, sizeof(*svm));
+                               kfree(svm);
+                       }
+               }
+               break;
+       }
+  out:
+       return ret;
+ }
+ /* Page request queue descriptor */
+ struct page_req_dsc {
+       union {
+               struct {
+                       u64 type:8;
+                       u64 pasid_present:1;
+                       u64 priv_data_present:1;
+                       u64 rsvd:6;
+                       u64 rid:16;
+                       u64 pasid:20;
+                       u64 exe_req:1;
+                       u64 pm_req:1;
+                       u64 rsvd2:10;
+               };
+               u64 qw_0;
+       };
+       union {
+               struct {
+                       u64 rd_req:1;
+                       u64 wr_req:1;
+                       u64 lpig:1;
+                       u64 prg_index:9;
+                       u64 addr:52;
+               };
+               u64 qw_1;
+       };
+       u64 priv_data[2];
+ };
+ #define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20)
+ static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
+ {
+       unsigned long requested = 0;
+       if (req->exe_req)
+               requested |= VM_EXEC;
+       if (req->rd_req)
+               requested |= VM_READ;
+       if (req->wr_req)
+               requested |= VM_WRITE;
+       return (requested & ~vma->vm_flags) != 0;
+ }
+ static bool is_canonical_address(u64 addr)
+ {
+       int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
+       long saddr = (long) addr;
+       return (((saddr << shift) >> shift) == saddr);
+ }
+ /**
+  * intel_svm_drain_prq - Drain page requests and responses for a pasid
+  * @dev: target device
+  * @pasid: pasid for draining
+  *
+  * Drain all pending page requests and responses related to @pasid in both
+  * software and hardware. This is supposed to be called after the device
+  * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB
+  * and DevTLB have been invalidated.
+  *
+  * It waits until all pending page requests for @pasid in the page fault
+  * queue are completed by the prq handling thread. Then follow the steps
+  * described in VT-d spec CH7.10 to drain all page requests and page
+  * responses pending in the hardware.
+  */
+ static void intel_svm_drain_prq(struct device *dev, int pasid)
+ {
+       struct device_domain_info *info;
+       struct dmar_domain *domain;
+       struct intel_iommu *iommu;
+       struct qi_desc desc[3];
+       struct pci_dev *pdev;
+       int head, tail;
+       u16 sid, did;
+       int qdep;
+       info = get_domain_info(dev);
+       if (WARN_ON(!info || !dev_is_pci(dev)))
+               return;
+       if (!info->pri_enabled)
+               return;
+       iommu = info->iommu;
+       domain = info->domain;
+       pdev = to_pci_dev(dev);
+       sid = PCI_DEVID(info->bus, info->devfn);
+       did = domain->iommu_did[iommu->seq_id];
+       qdep = pci_ats_queue_depth(pdev);
+       /*
+        * Check and wait until all pending page requests in the queue are
+        * handled by the prq handling thread.
+        */
+ prq_retry:
+       reinit_completion(&iommu->prq_complete);
+       tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
+       head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
+       while (head != tail) {
+               struct page_req_dsc *req;
+               req = &iommu->prq[head / sizeof(*req)];
+               if (!req->pasid_present || req->pasid != pasid) {
+                       head = (head + sizeof(*req)) & PRQ_RING_MASK;
+                       continue;
+               }
+               wait_for_completion(&iommu->prq_complete);
+               goto prq_retry;
+       }
+       /*
+        * Perform steps described in VT-d spec CH7.10 to drain page
+        * requests and responses in hardware.
+        */
+       memset(desc, 0, sizeof(desc));
+       desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
+                       QI_IWD_FENCE |
+                       QI_IWD_TYPE;
+       desc[1].qw0 = QI_EIOTLB_PASID(pasid) |
+                       QI_EIOTLB_DID(did) |
+                       QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
+                       QI_EIOTLB_TYPE;
+       desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) |
+                       QI_DEV_EIOTLB_SID(sid) |
+                       QI_DEV_EIOTLB_QDEP(qdep) |
+                       QI_DEIOTLB_TYPE |
+                       QI_DEV_IOTLB_PFSID(info->pfsid);
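+       /*
+        * desc[0] is a fenced invalidation-wait descriptor, desc[1] a
+        * PASID-based IOTLB invalidation and desc[2] a PASID-based
+        * device-TLB invalidation; the QI_OPT_WAIT_DRAIN flag asks the
+        * invalidation queue to drain pending page requests/responses
+        * before the wait descriptor completes.
+        */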
+ qi_retry:
+       reinit_completion(&iommu->prq_complete);
+       qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN);
+       if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
+               wait_for_completion(&iommu->prq_complete);
+               goto qi_retry;
+       }
+ }
+ static irqreturn_t prq_event_thread(int irq, void *d)
+ {
+       struct intel_iommu *iommu = d;
+       struct intel_svm *svm = NULL;
+       int head, tail, handled = 0;
+       /* Clear PPR bit before reading head/tail registers, to
+        * ensure that we get a new interrupt if needed. */
+       writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);
+       tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
+       head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
+       while (head != tail) {
+               struct intel_svm_dev *sdev;
+               struct vm_area_struct *vma;
+               struct page_req_dsc *req;
+               struct qi_desc resp;
+               int result;
+               vm_fault_t ret;
+               u64 address;
+               handled = 1;
+               req = &iommu->prq[head / sizeof(*req)];
+               result = QI_RESP_FAILURE;
+               address = (u64)req->addr << VTD_PAGE_SHIFT;
+               if (!req->pasid_present) {
+                       pr_err("%s: Page request without PASID: %08llx %08llx\n",
+                              iommu->name, ((unsigned long long *)req)[0],
+                              ((unsigned long long *)req)[1]);
+                       goto no_pasid;
+               }
+               if (!svm || svm->pasid != req->pasid) {
+                       rcu_read_lock();
+                       svm = ioasid_find(NULL, req->pasid, NULL);
+                       /* It *can't* go away, because the driver is not permitted
+                        * to unbind the mm while any page faults are outstanding.
+                        * So we only need RCU to protect the internal idr code. */
+                       rcu_read_unlock();
+                       if (IS_ERR_OR_NULL(svm)) {
+                               pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
+                                      iommu->name, req->pasid, ((unsigned long long *)req)[0],
+                                      ((unsigned long long *)req)[1]);
+                               goto no_pasid;
+                       }
+               }
+               result = QI_RESP_INVALID;
+               /* Since we're using init_mm.pgd directly, we should never take
+                * any faults on kernel addresses. */
+               if (!svm->mm)
+                       goto bad_req;
+               /* If address is not canonical, return invalid response */
+               if (!is_canonical_address(address))
+                       goto bad_req;
+               /* If the mm is already defunct, don't handle faults. */
+               if (!mmget_not_zero(svm->mm))
+                       goto bad_req;
 -              down_read(&svm->mm->mmap_sem);
++              mmap_read_lock(svm->mm);
+               vma = find_extend_vma(svm->mm, address);
+               if (!vma || address < vma->vm_start)
+                       goto invalid;
+               if (access_error(vma, req))
+                       goto invalid;
+               ret = handle_mm_fault(vma, address,
+                                     req->wr_req ? FAULT_FLAG_WRITE : 0);
+               if (ret & VM_FAULT_ERROR)
+                       goto invalid;
+               result = QI_RESP_SUCCESS;
+       invalid:
 -              up_read(&svm->mm->mmap_sem);
++              mmap_read_unlock(svm->mm);
+               mmput(svm->mm);
+       bad_req:
+               /* Accounting for major/minor faults? */
+               rcu_read_lock();
+               list_for_each_entry_rcu(sdev, &svm->devs, list) {
+                       if (sdev->sid == req->rid)
+                               break;
+               }
+               /* Other devices can go away, but the drivers are not permitted
+                * to unbind while any page faults might be in flight. So it's
+                * OK to drop the 'lock' here now we have it. */
+               rcu_read_unlock();
+               if (WARN_ON(&sdev->list == &svm->devs))
+                       sdev = NULL;
+               if (sdev && sdev->ops && sdev->ops->fault_cb) {
+                       int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
+                               (req->exe_req << 1) | (req->pm_req);
+                       sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr,
+                                           req->priv_data, rwxp, result);
+               }
+               /* We get here in the error case where the PASID lookup failed,
+                  and these can be NULL. Do not use them below this point! */
+               sdev = NULL;
+               svm = NULL;
+       no_pasid:
+               if (req->lpig || req->priv_data_present) {
+                       /*
+                        * Per VT-d spec. v3.0 ch7.7, system software must
+                        * respond with page group response if private data
+                        * is present (PDP) or last page in group (LPIG) bit
+                        * is set. This is an additional VT-d feature beyond
+                        * PCI ATS spec.
+                        */
+                       resp.qw0 = QI_PGRP_PASID(req->pasid) |
+                               QI_PGRP_DID(req->rid) |
+                               QI_PGRP_PASID_P(req->pasid_present) |
+                               QI_PGRP_PDP(req->pasid_present) |
+                               QI_PGRP_RESP_CODE(result) |
+                               QI_PGRP_RESP_TYPE;
+                       resp.qw1 = QI_PGRP_IDX(req->prg_index) |
+                               QI_PGRP_LPIG(req->lpig);
+                       if (req->priv_data_present)
+                               memcpy(&resp.qw2, req->priv_data,
+                                      sizeof(req->priv_data));
+                       resp.qw2 = 0;
+                       resp.qw3 = 0;
+                       qi_submit_sync(iommu, &resp, 1, 0);
+               }
+               head = (head + sizeof(*req)) & PRQ_RING_MASK;
+       }
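+       /* Tell the hardware that all requests up to 'tail' have been handled. */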
+       dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);
+       /*
+        * Clear the page request overflow bit and wake up all threads that
+        * are waiting for the completion of this handling.
+        */
+       if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO)
+               writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);
+       if (!completion_done(&iommu->prq_complete))
+               complete(&iommu->prq_complete);
+       return IRQ_RETVAL(handled);
+ }
+ #define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)
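+ /*
+  * intel_svm_bind(), intel_svm_unbind() and intel_svm_get_pasid() back the
+  * generic iommu_sva_bind_device()/iommu_sva_unbind_device()/
+  * iommu_sva_get_pasid() interfaces; device drivers are expected to use
+  * that API rather than call these directly.
+  */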
+ struct iommu_sva *
+ intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
+ {
+       struct iommu_sva *sva = ERR_PTR(-EINVAL);
+       struct intel_svm_dev *sdev = NULL;
+       int flags = 0;
+       int ret;
+       /*
+        * TODO: Consolidate with generic iommu-sva bind after it is merged.
+        * It will require shared SVM data structures, i.e. combine io_mm
+        * and intel_svm etc.
+        */
+       if (drvdata)
+               flags = *(int *)drvdata;
+       mutex_lock(&pasid_mutex);
+       ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev);
+       if (ret)
+               sva = ERR_PTR(ret);
+       else if (sdev)
+               sva = &sdev->sva;
+       else
+               WARN(!sdev, "SVM bind succeeded with no sdev!\n");
+       mutex_unlock(&pasid_mutex);
+       return sva;
+ }
+ void intel_svm_unbind(struct iommu_sva *sva)
+ {
+       struct intel_svm_dev *sdev;
+       mutex_lock(&pasid_mutex);
+       sdev = to_intel_svm_dev(sva);
+       intel_svm_unbind_mm(sdev->dev, sdev->pasid);
+       mutex_unlock(&pasid_mutex);
+ }
+ int intel_svm_get_pasid(struct iommu_sva *sva)
+ {
+       struct intel_svm_dev *sdev;
+       int pasid;
+       mutex_lock(&pasid_mutex);
+       sdev = to_intel_svm_dev(sva);
+       pasid = sdev->pasid;
+       mutex_unlock(&pasid_mutex);
+       return pasid;
+ }