drm/amdkfd: deregister svm range
authorPhilip Yang <Philip.Yang@amd.com>
Wed, 8 Apr 2020 15:09:45 +0000 (11:09 -0400)
committerAlex Deucher <alexander.deucher@amd.com>
Wed, 21 Apr 2021 01:47:03 +0000 (21:47 -0400)
When application explicitly call unmap or unmap from mmput when
application exit, driver will receive MMU_NOTIFY_UNMAP event to remove
svm range from process svms object tree and list first, unmap from GPUs
(in the following patch).

Split the svm ranges to handle partial unmapping of svm ranges. To
avoid deadlocks, updating MMU notifiers, range lists and interval trees
is done in a deferred worker. New child ranges are attached to their
parent range's child_list until the worker can update the
svm_range_list. svm_range_set_attr flushes deferred work and takes the
mmap_write_lock to guarantee that it has an up-to-date svm_range_list.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Signed-off-by: Alex Sierra <alex.sierra@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
drivers/gpu/drm/amd/amdkfd/kfd_svm.c
drivers/gpu/drm/amd/amdkfd/kfd_svm.h

index cfdd34c2223c0e4d61521a467c5b146500ea9b46..6cb0c4168fa0ba2f912213891cb3f9dfad3e8d4f 100644 (file)
@@ -735,6 +735,9 @@ struct svm_range_list {
        struct mutex                    lock;
        struct rb_root_cached           objects;
        struct list_head                list;
+       struct work_struct              deferred_list_work;
+       struct list_head                deferred_range_list;
+       spinlock_t                      deferred_list_lock;
 };
 
 /* Process data */
index 2f92b92739bc0679522a77bb0f58733adc708019..72d54cf4acc8dbdb4e633016c8fe598e36e492f2 100644 (file)
@@ -137,6 +137,8 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
        INIT_LIST_HEAD(&prange->update_list);
        INIT_LIST_HEAD(&prange->remove_list);
        INIT_LIST_HEAD(&prange->insert_list);
+       INIT_LIST_HEAD(&prange->deferred_list);
+       INIT_LIST_HEAD(&prange->child_list);
        mutex_init(&prange->lock);
        svm_range_set_default_attributes(&prange->preferred_loc,
                                         &prange->prefetch_loc,
@@ -411,6 +413,18 @@ svm_range_split_head(struct svm_range *prange, struct svm_range *new,
        return r;
 }
 
+static void
+svm_range_add_child(struct svm_range *prange, struct mm_struct *mm,
+                   struct svm_range *pchild, enum svm_work_list_ops op)
+{
+       pr_debug("add child 0x%p [0x%lx 0x%lx] to prange 0x%p child list %d\n",
+                pchild, pchild->start, pchild->last, prange, op);
+
+       pchild->work_item.mm = mm;
+       pchild->work_item.op = op;
+       list_add_tail(&pchild->child_list, &prange->child_list);
+}
+
 /*
  * Validation+GPU mapping with concurrent invalidation (MMU notifiers)
  *
@@ -470,6 +484,30 @@ unreserve_out:
        return r;
 }
 
+/**
+ * svm_range_list_lock_and_flush_work - flush pending deferred work
+ *
+ * @svms: the svm range list
+ * @mm: the mm structure
+ *
+ * Context: Returns with mmap write lock held, pending deferred work flushed
+ *
+ */
+static void
+svm_range_list_lock_and_flush_work(struct svm_range_list *svms,
+                                  struct mm_struct *mm)
+{
+retry_flush_work:
+       flush_work(&svms->deferred_list_work);
+       mmap_write_lock(mm);
+
+       if (list_empty(&svms->deferred_range_list))
+               return;
+       mmap_write_unlock(mm);
+       pr_debug("retry flush\n");
+       goto retry_flush_work;
+}
+
 static struct svm_range *svm_range_clone(struct svm_range *old)
 {
        struct svm_range *new;
@@ -610,15 +648,260 @@ out:
        return r;
 }
 
+static void
+svm_range_update_notifier_and_interval_tree(struct mm_struct *mm,
+                                           struct svm_range *prange)
+{
+       unsigned long start;
+       unsigned long last;
+
+       start = prange->notifier.interval_tree.start >> PAGE_SHIFT;
+       last = prange->notifier.interval_tree.last >> PAGE_SHIFT;
+
+       if (prange->start == start && prange->last == last)
+               return;
+
+       pr_debug("up notifier 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n",
+                 prange->svms, prange, start, last, prange->start,
+                 prange->last);
+
+       if (start != 0 && last != 0) {
+               interval_tree_remove(&prange->it_node, &prange->svms->objects);
+               svm_range_remove_notifier(prange);
+       }
+       prange->it_node.start = prange->start;
+       prange->it_node.last = prange->last;
+
+       interval_tree_insert(&prange->it_node, &prange->svms->objects);
+       svm_range_add_notifier_locked(mm, prange);
+}
+
+static void
+svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange)
+{
+       struct mm_struct *mm = prange->work_item.mm;
+
+       switch (prange->work_item.op) {
+       case SVM_OP_NULL:
+               pr_debug("NULL OP 0x%p prange 0x%p [0x%lx 0x%lx]\n",
+                        svms, prange, prange->start, prange->last);
+               break;
+       case SVM_OP_UNMAP_RANGE:
+               pr_debug("remove 0x%p prange 0x%p [0x%lx 0x%lx]\n",
+                        svms, prange, prange->start, prange->last);
+               svm_range_unlink(prange);
+               svm_range_remove_notifier(prange);
+               svm_range_free(prange);
+               break;
+       case SVM_OP_UPDATE_RANGE_NOTIFIER:
+               pr_debug("update notifier 0x%p prange 0x%p [0x%lx 0x%lx]\n",
+                        svms, prange, prange->start, prange->last);
+               svm_range_update_notifier_and_interval_tree(mm, prange);
+               break;
+       case SVM_OP_ADD_RANGE:
+               pr_debug("add 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, prange,
+                        prange->start, prange->last);
+               svm_range_add_to_svms(prange);
+               svm_range_add_notifier_locked(mm, prange);
+               break;
+       default:
+               WARN_ONCE(1, "Unknown prange 0x%p work op %d\n", prange,
+                        prange->work_item.op);
+       }
+}
+
+static void svm_range_deferred_list_work(struct work_struct *work)
+{
+       struct svm_range_list *svms;
+       struct svm_range *prange;
+       struct mm_struct *mm;
+
+       svms = container_of(work, struct svm_range_list, deferred_list_work);
+       pr_debug("enter svms 0x%p\n", svms);
+
+       spin_lock(&svms->deferred_list_lock);
+       while (!list_empty(&svms->deferred_range_list)) {
+               prange = list_first_entry(&svms->deferred_range_list,
+                                         struct svm_range, deferred_list);
+               spin_unlock(&svms->deferred_list_lock);
+               pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange,
+                        prange->start, prange->last, prange->work_item.op);
+
+               mm = prange->work_item.mm;
+               mmap_write_lock(mm);
+               mutex_lock(&svms->lock);
+
+               /* Remove from deferred_list must be inside mmap write lock,
+                * otherwise, svm_range_list_lock_and_flush_work may hold mmap
+                * write lock, and continue because deferred_list is empty, then
+                * deferred_list handle is blocked by mmap write lock.
+                */
+               spin_lock(&svms->deferred_list_lock);
+               list_del_init(&prange->deferred_list);
+               spin_unlock(&svms->deferred_list_lock);
+
+               while (!list_empty(&prange->child_list)) {
+                       struct svm_range *pchild;
+
+                       pchild = list_first_entry(&prange->child_list,
+                                               struct svm_range, child_list);
+                       pr_debug("child prange 0x%p op %d\n", pchild,
+                                pchild->work_item.op);
+                       list_del_init(&pchild->child_list);
+                       svm_range_handle_list_op(svms, pchild);
+               }
+
+               svm_range_handle_list_op(svms, prange);
+               mutex_unlock(&svms->lock);
+               mmap_write_unlock(mm);
+
+               spin_lock(&svms->deferred_list_lock);
+       }
+       spin_unlock(&svms->deferred_list_lock);
+
+       pr_debug("exit svms 0x%p\n", svms);
+}
+
+static void
+svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange,
+                       struct mm_struct *mm, enum svm_work_list_ops op)
+{
+       spin_lock(&svms->deferred_list_lock);
+       /* if prange is on the deferred list */
+       if (!list_empty(&prange->deferred_list)) {
+               pr_debug("update exist prange 0x%p work op %d\n", prange, op);
+               WARN_ONCE(prange->work_item.mm != mm, "unmatch mm\n");
+               if (op != SVM_OP_NULL &&
+                   prange->work_item.op != SVM_OP_UNMAP_RANGE)
+                       prange->work_item.op = op;
+       } else {
+               prange->work_item.op = op;
+               prange->work_item.mm = mm;
+               list_add_tail(&prange->deferred_list,
+                             &prange->svms->deferred_range_list);
+               pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n",
+                        prange, prange->start, prange->last, op);
+       }
+       spin_unlock(&svms->deferred_list_lock);
+}
+
+static void schedule_deferred_list_work(struct svm_range_list *svms)
+{
+       spin_lock(&svms->deferred_list_lock);
+       if (!list_empty(&svms->deferred_range_list))
+               schedule_work(&svms->deferred_list_work);
+       spin_unlock(&svms->deferred_list_lock);
+}
+
+static void
+svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent,
+                     struct svm_range *prange, unsigned long start,
+                     unsigned long last)
+{
+       struct svm_range *head;
+       struct svm_range *tail;
+
+       if (prange->work_item.op == SVM_OP_UNMAP_RANGE) {
+               pr_debug("prange 0x%p [0x%lx 0x%lx] is already freed\n", prange,
+                        prange->start, prange->last);
+               return;
+       }
+       if (start > prange->last || last < prange->start)
+               return;
+
+       head = tail = prange;
+       if (start > prange->start)
+               svm_range_split(prange, prange->start, start - 1, &tail);
+       if (last < tail->last)
+               svm_range_split(tail, last + 1, tail->last, &head);
+
+       if (head != prange && tail != prange) {
+               svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE);
+               svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE);
+       } else if (tail != prange) {
+               svm_range_add_child(parent, mm, tail, SVM_OP_UNMAP_RANGE);
+       } else if (head != prange) {
+               svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE);
+       } else if (parent != prange) {
+               prange->work_item.op = SVM_OP_UNMAP_RANGE;
+       }
+}
+
+static void
+svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
+                        unsigned long start, unsigned long last)
+{
+       struct svm_range_list *svms;
+       struct svm_range *pchild;
+       struct kfd_process *p;
+       bool unmap_parent;
+
+       p = kfd_lookup_process_by_mm(mm);
+       if (!p)
+               return;
+       svms = &p->svms;
+
+       pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms,
+                prange, prange->start, prange->last, start, last);
+
+       unmap_parent = start <= prange->start && last >= prange->last;
+
+       list_for_each_entry(pchild, &prange->child_list, child_list)
+               svm_range_unmap_split(mm, prange, pchild, start, last);
+       svm_range_unmap_split(mm, prange, prange, start, last);
+
+       if (unmap_parent)
+               svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE);
+       else
+               svm_range_add_list_work(svms, prange, mm,
+                                       SVM_OP_UPDATE_RANGE_NOTIFIER);
+       schedule_deferred_list_work(svms);
+
+       kfd_unref_process(p);
+}
+
 /**
  * svm_range_cpu_invalidate_pagetables - interval notifier callback
  *
+ * MMU range unmap notifier to remove svm ranges
  */
 static bool
 svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
                                    const struct mmu_notifier_range *range,
                                    unsigned long cur_seq)
 {
+       struct svm_range *prange;
+       unsigned long start;
+       unsigned long last;
+
+       if (range->event == MMU_NOTIFY_RELEASE)
+               return true;
+
+       start = mni->interval_tree.start;
+       last = mni->interval_tree.last;
+       start = (start > range->start ? start : range->start) >> PAGE_SHIFT;
+       last = (last < (range->end - 1) ? last : range->end - 1) >> PAGE_SHIFT;
+       pr_debug("[0x%lx 0x%lx] range[0x%lx 0x%lx] notifier[0x%lx 0x%lx] %d\n",
+                start, last, range->start >> PAGE_SHIFT,
+                (range->end - 1) >> PAGE_SHIFT,
+                mni->interval_tree.start >> PAGE_SHIFT,
+                mni->interval_tree.last >> PAGE_SHIFT, range->event);
+
+       prange = container_of(mni, struct svm_range, notifier);
+
+       svm_range_lock(prange);
+       mmu_interval_set_seq(mni, cur_seq);
+
+       switch (range->event) {
+       case MMU_NOTIFY_UNMAP:
+               svm_range_unmap_from_cpu(mni->mm, prange, start, last);
+               break;
+       default:
+               break;
+       }
+
+       svm_range_unlock(prange);
+
        return true;
 }
 
@@ -627,6 +910,9 @@ void svm_range_list_fini(struct kfd_process *p)
        mutex_destroy(&p->svms.lock);
 
        pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms);
+
+       /* Ensure list work is finished before process is destroyed */
+       flush_work(&p->svms.deferred_list_work);
 }
 
 int svm_range_list_init(struct kfd_process *p)
@@ -636,6 +922,9 @@ int svm_range_list_init(struct kfd_process *p)
        svms->objects = RB_ROOT_CACHED;
        mutex_init(&svms->lock);
        INIT_LIST_HEAD(&svms->list);
+       INIT_WORK(&svms->deferred_list_work, svm_range_deferred_list_work);
+       INIT_LIST_HEAD(&svms->deferred_range_list);
+       spin_lock_init(&svms->deferred_list_lock);
 
        return 0;
 }
@@ -753,7 +1042,7 @@ svm_range_set_attr(struct kfd_process *p, uint64_t start, uint64_t size,
 
        mutex_lock(&process_info->lock);
 
-       mmap_write_lock(mm);
+       svm_range_list_lock_and_flush_work(svms, mm);
 
        if (!svm_range_is_valid(mm, start, size)) {
                pr_debug("invalid range\n");
index 10c73348c2c0a2e976c8553bb0f4c7cedaf814ae..173e93e138a92b45c49055eb46b7730c38577ccd 100644 (file)
 #include "amdgpu.h"
 #include "kfd_priv.h"
 
+enum svm_work_list_ops {
+       SVM_OP_NULL,
+       SVM_OP_UNMAP_RANGE,
+       SVM_OP_UPDATE_RANGE_NOTIFIER,
+       SVM_OP_ADD_RANGE
+};
+
+struct svm_work_list_item {
+       enum svm_work_list_ops op;
+       struct mm_struct *mm;
+};
+
 /**
  * struct svm_range - shared virtual memory range
  *
@@ -54,6 +66,9 @@
  * @actual_loc: the actual location, 0 for CPU, or GPU id
  * @granularity:migration granularity, log2 num pages
  * @notifier:   register mmu interval notifier
+ * @work_item:  deferred work item information
+ * @deferred_list: list header used to add range to deferred list
+ * @child_list: list header for split ranges which are not added to svms yet
  * @bitmap_access: index bitmap of GPUs which can access the range
  * @bitmap_aip: index bitmap of GPUs which can access the range in place
  *
@@ -79,6 +94,9 @@ struct svm_range {
        uint32_t                        actual_loc;
        uint8_t                         granularity;
        struct mmu_interval_notifier    notifier;
+       struct svm_work_list_item       work_item;
+       struct list_head                deferred_list;
+       struct list_head                child_list;
        DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE);
        DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE);
 };