drm/amdkfd: Improve process termination handling
author     Felix Kuehling <Felix.Kuehling@amd.com>
           Wed, 27 Sep 2017 04:09:52 +0000 (00:09 -0400)
committer  Oded Gabbay <oded.gabbay@gmail.com>
           Wed, 27 Sep 2017 04:09:52 +0000 (00:09 -0400)
Separate device queue termination from process queue manager
termination. Unmap all queues at once instead of one at a time.
Unmap device queues before the PASID is unbound, in
kfd_process_iommu_unbind_callback.

When resetting wavefronts in non-HWS mode, do it before the VMID is
released.

Signed-off-by: Ben Goz <ben.goz@amd.com>
Signed-off-by: shaoyun liu <shaoyun.liu@amd.com>
Signed-off-by: Amber Lin <Amber.Lin@amd.com>
Signed-off-by: Yong Zhao <Yong.Zhao@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
drivers/gpu/drm/amd/amdkfd/kfd_process.c
drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
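
The patch funnels both termination paths, the MMU notifier release and the
IOMMU unbind callback, through one idempotent per-device dequeue helper
guarded by a new already_dequeued flag (both callers hold p->mutex). A
minimal userspace sketch of that guard pattern, using stand-in names rather
than the real kernel types:

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-in for struct kfd_process_device */
    struct pdd_stub {
            bool already_dequeued;
    };

    /* Analogous to kfd_process_dequeue_from_device(): only the first
     * caller performs the actual dequeue; later calls are no-ops.
     */
    static void dequeue_from_device(struct pdd_stub *pdd)
    {
            if (pdd->already_dequeued)
                    return;
            printf("process_termination() runs exactly once\n");
            pdd->already_dequeued = true;
    }

    int main(void)
    {
            struct pdd_stub pdd = { .already_dequeued = false };

            dequeue_from_device(&pdd); /* e.g. IOMMU unbind callback */
            dequeue_from_device(&pdd); /* e.g. MMU notifier release: no-op */
            return 0;
    }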

index f5a5988..dd7e445 100644
@@ -296,65 +296,73 @@ out_deallocate_hqd:
        return retval;
 }
 
-static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
+/* The dqm->lock must be held before calling destroy_queue_nocpsch_locked
+ * to avoid unsynchronized access
+ */
+static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
                                struct qcm_process_device *qpd,
                                struct queue *q)
 {
        int retval;
        struct mqd_manager *mqd;
 
-       retval = 0;
-
-       mutex_lock(&dqm->lock);
+       mqd = dqm->ops.get_mqd_manager(dqm,
+               get_mqd_type_from_queue_type(q->properties.type));
+       if (!mqd)
+               return -ENOMEM;
 
        if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) {
-               mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE);
-               if (mqd == NULL) {
-                       retval = -ENOMEM;
-                       goto out;
-               }
                deallocate_hqd(dqm, q);
        } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
-               mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_SDMA);
-               if (mqd == NULL) {
-                       retval = -ENOMEM;
-                       goto out;
-               }
                dqm->sdma_queue_count--;
                deallocate_sdma_queue(dqm, q->sdma_id);
        } else {
                pr_debug("q->properties.type %d is invalid\n",
                                q->properties.type);
-               retval = -EINVAL;
-               goto out;
+               return -EINVAL;
        }
+       dqm->total_queue_count--;
 
        retval = mqd->destroy_mqd(mqd, q->mqd,
                                KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
                                KFD_UNMAP_LATENCY_MS,
                                q->pipe, q->queue);
-
-       if (retval)
-               goto out;
+       if (retval == -ETIME)
+               qpd->reset_wavefronts = true;
 
        mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
 
        list_del(&q->list);
-       if (list_empty(&qpd->queues_list))
+       if (list_empty(&qpd->queues_list)) {
+               if (qpd->reset_wavefronts) {
+                       pr_warn("Resetting wave fronts (nocpsch) on dev %p\n",
+                                       dqm->dev);
+                       /* dbgdev_wave_reset_wavefronts has to be called before
+                        * deallocate_vmid(), i.e. when vmid is still in use.
+                        */
+                       dbgdev_wave_reset_wavefronts(dqm->dev,
+                                       qpd->pqm->process);
+                       qpd->reset_wavefronts = false;
+               }
+
                deallocate_vmid(dqm, qpd, q);
+       }
        if (q->properties.is_active)
                dqm->queue_count--;
 
-       /*
-        * Unconditionally decrement this counter, regardless of the queue's
-        * type
-        */
-       dqm->total_queue_count--;
-       pr_debug("Total of %d queues are accountable so far\n",
-                       dqm->total_queue_count);
+       return retval;
+}
 
-out:
+static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
+                               struct qcm_process_device *qpd,
+                               struct queue *q)
+{
+       int retval;
+
+       mutex_lock(&dqm->lock);
+       retval = destroy_queue_nocpsch_locked(dqm, qpd, q);
        mutex_unlock(&dqm->lock);
+
        return retval;
 }
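
The destroy_queue_nocpsch()/destroy_queue_nocpsch_locked() split above is
the usual lock/_locked kernel idiom: dqm->lock is not recursive, so
process_termination_nocpsch() below, which already holds the lock while
walking the queue list, must call the _locked variant directly. A
simplified userspace sketch of the idiom (pthread stand-ins, not the
actual kernel code):

    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static int queue_count = 4;

    /* Caller must hold `lock`; mirrors destroy_queue_nocpsch_locked(). */
    static int destroy_one_locked(void)
    {
            if (queue_count == 0)
                    return -1;
            queue_count--;
            return 0;
    }

    /* Public entry point; mirrors destroy_queue_nocpsch(). */
    static int destroy_one(void)
    {
            int ret;

            pthread_mutex_lock(&lock);
            ret = destroy_one_locked();
            pthread_mutex_unlock(&lock);
            return ret;
    }

    /* Bulk path; mirrors process_termination_nocpsch(). Calling
     * destroy_one() here instead would deadlock on the non-recursive
     * mutex.
     */
    static void destroy_all(void)
    {
            pthread_mutex_lock(&lock);
            while (queue_count > 0)
                    destroy_one_locked();
            pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
            destroy_one();  /* takes the lock itself */
            destroy_all();  /* takes it once for the whole walk */
            return 0;
    }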
 
@@ -921,10 +929,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
                                enum kfd_unmap_queues_filter filter,
                                uint32_t filter_param)
 {
-       int retval;
-       struct kfd_process_device *pdd;
-
-       retval = 0;
+       int retval = 0;
 
        if (!dqm->active_runlist)
                return retval;
@@ -948,12 +953,9 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
        /* should be timed out */
        retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED,
                                QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS);
-       if (retval) {
-               pdd = kfd_get_process_device_data(dqm->dev,
-                               kfd_get_process(current));
-               pdd->reset_wavefronts = true;
+       if (retval)
                return retval;
-       }
+
        pm_release_ib(&dqm->packets);
        dqm->active_runlist = false;
 
@@ -1015,7 +1017,10 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
        if (q->properties.is_active)
                dqm->queue_count--;
 
-       execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+       retval = execute_queues_cpsch(dqm,
+                               KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+       if (retval == -ETIME)
+               qpd->reset_wavefronts = true;
 
        mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
 
@@ -1105,6 +1110,108 @@ out:
        return retval;
 }
 
+static int process_termination_nocpsch(struct device_queue_manager *dqm,
+               struct qcm_process_device *qpd)
+{
+       struct queue *q, *next;
+       struct device_process_node *cur, *next_dpn;
+       int retval = 0;
+
+       mutex_lock(&dqm->lock);
+
+       /* Clear all user mode queues */
+       list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
+               int ret;
+
+               ret = destroy_queue_nocpsch_locked(dqm, qpd, q);
+               if (ret)
+                       retval = ret;
+       }
+
+       /* Unregister process */
+       list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) {
+               if (qpd == cur->qpd) {
+                       list_del(&cur->list);
+                       kfree(cur);
+                       dqm->processes_count--;
+                       break;
+               }
+       }
+
+       mutex_unlock(&dqm->lock);
+       return retval;
+}
+
+
+static int process_termination_cpsch(struct device_queue_manager *dqm,
+               struct qcm_process_device *qpd)
+{
+       int retval;
+       struct queue *q, *next;
+       struct kernel_queue *kq, *kq_next;
+       struct mqd_manager *mqd;
+       struct device_process_node *cur, *next_dpn;
+       enum kfd_unmap_queues_filter filter =
+               KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES;
+
+       retval = 0;
+
+       mutex_lock(&dqm->lock);
+
+       /* Clean all kernel queues */
+       list_for_each_entry_safe(kq, kq_next, &qpd->priv_queue_list, list) {
+               list_del(&kq->list);
+               dqm->queue_count--;
+               qpd->is_debug = false;
+               dqm->total_queue_count--;
+               filter = KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES;
+       }
+
+       /* Clear all user mode queues */
+       list_for_each_entry(q, &qpd->queues_list, list) {
+               if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
+                       dqm->sdma_queue_count--;
+
+               if (q->properties.is_active)
+                       dqm->queue_count--;
+
+               dqm->total_queue_count--;
+       }
+
+       /* Unregister process */
+       list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) {
+               if (qpd == cur->qpd) {
+                       list_del(&cur->list);
+                       kfree(cur);
+                       dqm->processes_count--;
+                       break;
+               }
+       }
+
+       retval = execute_queues_cpsch(dqm, filter, 0);
+       if (retval || qpd->reset_wavefronts) {
+               pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev);
+               dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process);
+               qpd->reset_wavefronts = false;
+       }
+
+       /* lastly, free mqd resources */
+       list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
+               mqd = dqm->ops.get_mqd_manager(dqm,
+                       get_mqd_type_from_queue_type(q->properties.type));
+               if (!mqd) {
+                       retval = -ENOMEM;
+                       goto out;
+               }
+               list_del(&q->list);
+               mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
+       }
+
+out:
+       mutex_unlock(&dqm->lock);
+       return retval;
+}
+
 struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 {
        struct device_queue_manager *dqm;
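
Two details in the termination helpers above are easy to miss. In
process_termination_nocpsch(), destroy_queue_nocpsch_locked() unlinks each
queue via list_del(), so the walk must use list_for_each_entry_safe(); the
final MQD cleanup loop in process_termination_cpsch() deletes as well and
does the same. The first cpsch walk only adjusts counters, so plain
list_for_each_entry() suffices there, and the hardware queues are then
unmapped in a single execute_queues_cpsch() call instead of one preemption
per queue. A minimal userspace sketch of the delete-while-iterating
pattern that forces the _safe variant:

    #include <stdlib.h>

    struct node {
            struct node *next;
    };

    /* The next pointer is saved before the current node is freed,
     * which is what list_for_each_entry_safe() does with its extra
     * iterator argument.
     */
    static void destroy_all(struct node **head)
    {
            struct node *cur = *head;
            struct node *next;

            while (cur) {
                    next = cur->next;  /* save before freeing */
                    free(cur);         /* analogous to list_del() + uninit */
                    cur = next;
            }
            *head = NULL;
    }

    int main(void)
    {
            struct node *head = calloc(1, sizeof(*head));

            if (!head)
                    return 1;
            head->next = calloc(1, sizeof(*head));
            destroy_all(&head);
            return 0;
    }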
@@ -1133,6 +1240,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
                dqm->ops.create_kernel_queue = create_kernel_queue_cpsch;
                dqm->ops.destroy_kernel_queue = destroy_kernel_queue_cpsch;
                dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
+               dqm->ops.process_termination = process_termination_cpsch;
                break;
        case KFD_SCHED_POLICY_NO_HWS:
                /* initialize dqm for no cp scheduling */
@@ -1147,6 +1255,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
                dqm->ops.initialize = initialize_nocpsch;
                dqm->ops.uninitialize = uninitialize;
                dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
+               dqm->ops.process_termination = process_termination_nocpsch;
                break;
        default:
                pr_err("Invalid scheduling policy %d\n", sched_policy);
index 60d46ce..31c2b1f 100644
@@ -77,6 +77,8 @@ struct device_process_node {
  * @set_cache_memory_policy: Sets memory policy (cached/ non cached) for the
  * memory apertures.
  *
+ * @process_termination: Clears all process queues belonging to that device.
+ *
  */
 
 struct device_queue_manager_ops {
@@ -120,6 +122,9 @@ struct device_queue_manager_ops {
                                           enum cache_policy alternate_policy,
                                           void __user *alternate_aperture_base,
                                           uint64_t alternate_aperture_size);
+
+       int (*process_termination)(struct device_queue_manager *dqm,
+                       struct qcm_process_device *qpd);
 };
 
 struct device_queue_manager_asic_ops {
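
The new hook extends the existing device_queue_manager_ops function
pointer table, so callers stay scheduler-agnostic:
kfd_process_dequeue_from_device() just invokes
dev->dqm->ops.process_termination() and gets whichever implementation
device_queue_manager_init() selected. A reduced sketch of this dispatch
pattern, using placeholder names:

    struct dqm_stub;

    struct dqm_ops_stub {
            int (*process_termination)(struct dqm_stub *dqm);
    };

    struct dqm_stub {
            struct dqm_ops_stub ops;
    };

    static int termination_cpsch_stub(struct dqm_stub *dqm)   { return 0; }
    static int termination_nocpsch_stub(struct dqm_stub *dqm) { return 0; }

    /* Mirrors device_queue_manager_init(): the scheduling policy picks
     * the implementation once; callers never branch on the policy.
     */
    static void dqm_init_stub(struct dqm_stub *dqm, int hws_enabled)
    {
            dqm->ops.process_termination = hws_enabled ?
                    termination_cpsch_stub : termination_nocpsch_stub;
    }

    int main(void)
    {
            struct dqm_stub dqm;

            dqm_init_stub(&dqm, 1);
            return dqm.ops.process_termination(&dqm);
    }
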
index 8af0d6f..2e13f5d 100644
@@ -421,6 +421,12 @@ struct qcm_process_device {
        unsigned int queue_count;
        unsigned int vmid;
        bool is_debug;
+
+       /* This flag indicates whether all wavefronts should be reset
+        * on process termination
+        */
+       bool reset_wavefronts;
+
        /*
         * All the memory management data should be here too
         */
@@ -454,6 +460,8 @@ struct kfd_process_device {
        /* The device that owns this data. */
        struct kfd_dev *dev;
 
+       /* The process that owns this kfd_process_device. */
+       struct kfd_process *process;
 
        /* per-process-per device QCM data structure */
        struct qcm_process_device qpd;
@@ -469,10 +477,12 @@ struct kfd_process_device {
        /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */
        enum kfd_pdd_bound bound;
 
-       /* This flag tells if we should reset all
-        * wavefronts on process termination
+       /* Flag indicating that this pdd's queues have been dequeued from
+        * the dqm. It prevents dev->dqm->ops.process_termination() from
+        * running twice when it was already called from the IOMMU unbind
+        * callback.
         */
-       bool reset_wavefronts;
+       bool already_dequeued;
 };
 
 #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd)
@@ -659,6 +669,8 @@ struct process_queue_node {
        struct list_head process_queue_list;
 };
 
+void kfd_process_dequeue_from_device(struct kfd_process_device *pdd);
+void kfd_process_dequeue_from_all_devices(struct kfd_process *p);
 int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p);
 void pqm_uninit(struct process_queue_manager *pqm);
 int pqm_create_queue(struct process_queue_manager *pqm,
index 1325f88..3ccb3b5 100644
@@ -171,9 +171,6 @@ static void kfd_process_wq_release(struct work_struct *work)
                pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n",
                                pdd->dev->id, p->pasid);
 
-               if (pdd->reset_wavefronts)
-                       dbgdev_wave_reset_wavefronts(pdd->dev, p);
-
                if (pdd->bound == PDD_BOUND)
                        amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid);
 
@@ -237,24 +234,17 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
 
        mutex_lock(&p->mutex);
 
-       /* In case our notifier is called before IOMMU notifier */
+       kfd_process_dequeue_from_all_devices(p);
        pqm_uninit(&p->pqm);
 
        /* Iterate over all process device data structure and check
-        * if we should delete debug managers and reset all wavefronts
+        * if we should delete debug managers
         */
-       list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
+       list_for_each_entry(pdd, &p->per_device_data, per_device_list)
                if ((pdd->dev->dbgmgr) &&
                                (pdd->dev->dbgmgr->pasid == p->pasid))
                        kfd_dbgmgr_destroy(pdd->dev->dbgmgr);
 
-               if (pdd->reset_wavefronts) {
-                       pr_warn("Resetting all wave fronts\n");
-                       dbgdev_wave_reset_wavefronts(pdd->dev, p);
-                       pdd->reset_wavefronts = false;
-               }
-       }
-
        mutex_unlock(&p->mutex);
 
        /*
@@ -368,8 +358,9 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
                INIT_LIST_HEAD(&pdd->qpd.queues_list);
                INIT_LIST_HEAD(&pdd->qpd.priv_queue_list);
                pdd->qpd.dqm = dev->dqm;
-               pdd->reset_wavefronts = false;
+               pdd->process = p;
                pdd->bound = PDD_UNBOUND;
+               pdd->already_dequeued = false;
                list_add(&pdd->per_device_list, &p->per_device_data);
        }
 
@@ -498,19 +489,12 @@ void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid)
        if ((dev->dbgmgr) && (dev->dbgmgr->pasid == p->pasid))
                kfd_dbgmgr_destroy(dev->dbgmgr);
 
-       pqm_uninit(&p->pqm);
-
        pdd = kfd_get_process_device_data(dev, p);
-
-       if (!pdd) {
-               mutex_unlock(&p->mutex);
-               return;
-       }
-
-       if (pdd->reset_wavefronts) {
-               dbgdev_wave_reset_wavefronts(pdd->dev, p);
-               pdd->reset_wavefronts = false;
-       }
+       if (pdd)
+               /* For GPUs relying on the IOMMU, we need to dequeue
+                * here while the PASID is still bound.
+                */
+               kfd_process_dequeue_from_device(pdd);
 
        mutex_unlock(&p->mutex);
 }
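
Ordering is the point of the kfd_process.c changes: queues are unmapped
through the still-bound PASID, so the dequeue must precede
amd_iommu_unbind_pasid(), and pqm_uninit() is left as pure software
cleanup. A schematic of the enforced teardown order, with stub functions
standing in for the real calls:

    #include <stdio.h>

    static void unmap_hw_queues(void)  { puts("1. process_termination"); }
    static void unbind_pasid(void)     { puts("2. amd_iommu_unbind_pasid"); }
    static void free_bookkeeping(void) { puts("3. pqm_uninit"); }

    int main(void)
    {
            unmap_hw_queues();   /* must run while the PASID is bound */
            unbind_pasid();
            free_bookkeeping();  /* touches no hardware, order-safe */
            return 0;
    }
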
index 68fe0d9..31ec3ca 100644
@@ -63,6 +63,25 @@ static int find_available_queue_slot(struct process_queue_manager *pqm,
        return 0;
 }
 
+void kfd_process_dequeue_from_device(struct kfd_process_device *pdd)
+{
+       struct kfd_dev *dev = pdd->dev;
+
+       if (pdd->already_dequeued)
+               return;
+
+       dev->dqm->ops.process_termination(dev->dqm, &pdd->qpd);
+       pdd->already_dequeued = true;
+}
+
+void kfd_process_dequeue_from_all_devices(struct kfd_process *p)
+{
+       struct kfd_process_device *pdd;
+
+       list_for_each_entry(pdd, &p->per_device_data, per_device_list)
+               kfd_process_dequeue_from_device(pdd);
+}
+
 int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p)
 {
        INIT_LIST_HEAD(&pqm->queues);
@@ -78,21 +97,14 @@ int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p)
 
 void pqm_uninit(struct process_queue_manager *pqm)
 {
-       int retval;
        struct process_queue_node *pqn, *next;
 
        list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) {
-               retval = pqm_destroy_queue(
-                               pqm,
-                               (pqn->q != NULL) ?
-                                       pqn->q->properties.queue_id :
-                                       pqn->kq->queue->properties.queue_id);
-
-               if (retval != 0) {
-                       pr_err("failed to destroy queue\n");
-                       return;
-               }
+               uninit_queue(pqn->q);
+               list_del(&pqn->process_queue_list);
+               kfree(pqn);
        }
+
        kfree(pqm->queue_slot_bitmap);
        pqm->queue_slot_bitmap = NULL;
 }
@@ -290,9 +302,6 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
        if (pqn->q) {
                dqm = pqn->q->device->dqm;
                retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q);
-               if (retval != 0)
-                       return retval;
-
                uninit_queue(pqn->q);
        }