drm/amdkfd: Fix circular lock in nocpsch path
author	Amber Lin <Amber.Lin@amd.com>
Mon, 7 Jun 2021 18:46:21 +0000 (14:46 -0400)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 19 Jul 2021 07:44:50 +0000 (09:44 +0200)
[ Upstream commit a7b2451d31cfa2e8aeccf3b35612ce33f02371fc ]

Calling free_mqd inside destroy_queue_nocpsch_locked can cause a
circular lock dependency. destroy_queue_nocpsch_locked is called under
the DQM lock, which is also taken in MMU notifiers, potentially in FS
reclaim context. free_mqd takes another lock, the BO reservation lock,
and can trigger FS reclaim; doing so while holding the DQM lock creates
a problematic circular lock dependency. Therefore move free_mqd out of
destroy_queue_nocpsch_locked and call it only after unlocking DQM.
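
The change follows the common "unlink under the lock, free outside the
lock" pattern. Below is a minimal, self-contained userspace C sketch of
that pattern, not the kfd code itself: a pthread mutex stands in for the
DQM lock, and names such as queue_manager and free_backing are
hypothetical stand-ins chosen for illustration.

	#include <pthread.h>
	#include <stdlib.h>

	struct queue_entry {
		struct queue_entry *next;
		void *mqd_backing;	/* stand-in for q->mqd / q->mqd_mem_obj */
	};

	struct queue_manager {
		pthread_mutex_t lock;		/* stand-in for the DQM lock */
		struct queue_entry *queues;	/* stand-in for qpd->queues_list */
	};

	/* May sleep/allocate, so it must not run with mgr->lock held. */
	static void free_backing(void *backing)
	{
		free(backing);
	}

	static void destroy_all_queues(struct queue_manager *mgr)
	{
		pthread_mutex_lock(&mgr->lock);
		while (mgr->queues) {
			struct queue_entry *q = mgr->queues;

			/* Unlink the queue while the lock is held... */
			mgr->queues = q->next;

			/* ...then drop the lock before the sleepable free. */
			pthread_mutex_unlock(&mgr->lock);
			free_backing(q->mqd_backing);
			free(q);
			pthread_mutex_lock(&mgr->lock);
		}
		pthread_mutex_unlock(&mgr->lock);
	}

Re-taking the lock on each iteration mirrors the dqm_unlock() /
free_mqd() / dqm_lock() sequence this patch adds to
process_termination_nocpsch: the list manipulation stays protected,
while the reservation-taking free runs outside the DQM lock.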

Signed-off-by: Amber Lin <Amber.Lin@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index b971532..ffb3d37 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -486,9 +486,6 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
        if (retval == -ETIME)
                qpd->reset_wavefronts = true;
 
-
-       mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
-
        list_del(&q->list);
        if (list_empty(&qpd->queues_list)) {
                if (qpd->reset_wavefronts) {
@@ -523,6 +520,8 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
        int retval;
        uint64_t sdma_val = 0;
        struct kfd_process_device *pdd = qpd_to_pdd(qpd);
+       struct mqd_manager *mqd_mgr =
+               dqm->mqd_mgrs[get_mqd_type_from_queue_type(q->properties.type)];
 
        /* Get the SDMA queue stats */
        if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
@@ -540,6 +539,8 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
                pdd->sdma_past_activity_counter += sdma_val;
        dqm_unlock(dqm);
 
+       mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
+
        return retval;
 }
 
@@ -1632,7 +1633,7 @@ static int set_trap_handler(struct device_queue_manager *dqm,
 static int process_termination_nocpsch(struct device_queue_manager *dqm,
                struct qcm_process_device *qpd)
 {
-       struct queue *q, *next;
+       struct queue *q;
        struct device_process_node *cur, *next_dpn;
        int retval = 0;
        bool found = false;
@@ -1640,12 +1641,19 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm,
        dqm_lock(dqm);
 
        /* Clear all user mode queues */
-       list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
+       while (!list_empty(&qpd->queues_list)) {
+               struct mqd_manager *mqd_mgr;
                int ret;
 
+               q = list_first_entry(&qpd->queues_list, struct queue, list);
+               mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+                               q->properties.type)];
                ret = destroy_queue_nocpsch_locked(dqm, qpd, q);
                if (ret)
                        retval = ret;
+               dqm_unlock(dqm);
+               mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
+               dqm_lock(dqm);
        }
 
        /* Unregister process */