From: Philip Yang Date: Sat, 11 Jun 2022 00:15:29 +0000 (-0400) Subject: drm/amdkfd: Free queue after unmap queue success X-Git-Tag: v6.1-rc5~617^2~17^2~58 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=ab8529b0cdb271d9b222cbbddb2641f3fca5df8f;p=platform%2Fkernel%2Flinux-starfive.git drm/amdkfd: Free queue after unmap queue success After queue unmap or remove from MES successfully, free queue sysfs entries, doorbell and remove from queue list. Otherwise, application may destroy queue again, cause below kernel warning or crash backtrace. For outstanding queues, either application forget to destroy or failed to destroy, kfd_process_notifier_release will remove queue sysfs entries, kfd_process_wq_release will free queue doorbell. v2: decrement_queue_count for MES queue refcount_t: underflow; use-after-free. WARNING: CPU: 7 PID: 3053 at lib/refcount.c:28 Call Trace: kobject_put+0xd6/0x1a0 kfd_procfs_del_queue+0x27/0x30 [amdgpu] pqm_destroy_queue+0xeb/0x240 [amdgpu] kfd_ioctl_destroy_queue+0x32/0x70 [amdgpu] kfd_ioctl+0x27d/0x500 [amdgpu] do_syscall_64+0x35/0x80 WARNING: CPU: 2 PID: 3053 at drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_device_queue_manager.c:400 Call Trace: deallocate_doorbell.isra.0+0x39/0x40 [amdgpu] destroy_queue_cpsch+0xb3/0x270 [amdgpu] pqm_destroy_queue+0x108/0x240 [amdgpu] kfd_ioctl_destroy_queue+0x32/0x70 [amdgpu] kfd_ioctl+0x27d/0x500 [amdgpu] general protection fault, probably for non-canonical address 0xdead000000000108: Call Trace: pqm_destroy_queue+0xf0/0x200 [amdgpu] kfd_ioctl_destroy_queue+0x2f/0x60 [amdgpu] kfd_ioctl+0x19b/0x600 [amdgpu] Signed-off-by: Philip Yang Reviewed-by: Graham Sider Signed-off-by: Alex Deucher --- diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 21aeb05..213246a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -1872,6 +1872,22 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, } + if (q->properties.is_active) { + if (!dqm->dev->shared_resources.enable_mes) { + retval = execute_queues_cpsch(dqm, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); + if (retval == -ETIME) + qpd->reset_wavefronts = true; + } else { + retval = remove_queue_mes(dqm, q, qpd); + } + + if (retval) + goto failed_unmap_queue; + + decrement_queue_count(dqm, qpd, q); + } + mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type( q->properties.type)]; @@ -1885,17 +1901,6 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, list_del(&q->list); qpd->queue_count--; - if (q->properties.is_active) { - if (!dqm->dev->shared_resources.enable_mes) { - decrement_queue_count(dqm, qpd, q); - retval = execute_queues_cpsch(dqm, - KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); - if (retval == -ETIME) - qpd->reset_wavefronts = true; - } else { - retval = remove_queue_mes(dqm, q, qpd); - } - } /* * Unconditionally decrement this counter, regardless of the queue's @@ -1912,6 +1917,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, return retval; +failed_unmap_queue: failed_try_destroy_debugged_queue: dqm_unlock(dqm); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index dc00484..99f2a64 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -419,7 +419,6 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) } if (pqn->q) { - kfd_procfs_del_queue(pqn->q); dqm = pqn->q->device->dqm; retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); if (retval) { @@ -439,6 +438,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) if (dev->shared_resources.enable_mes) amdgpu_amdkfd_free_gtt_mem(dev->adev, pqn->q->gang_ctx_bo); + kfd_procfs_del_queue(pqn->q); uninit_queue(pqn->q); }