drm/amdkfd: Process notifier release callback don't take mutex
authorPhilip Yang <Philip.Yang@amd.com>
Thu, 7 Jul 2022 20:54:19 +0000 (16:54 -0400)
committerAlex Deucher <alexander.deucher@amd.com>
Mon, 25 Jul 2022 13:31:05 +0000 (09:31 -0400)
Move process queues cleanup to deferred work kfd_process_wq_release, to
avoid potential deadlock circular locking warning:

 WARNING: possible circular locking dependency detected
               the existing dependency chain (in reverse order) is:
      -> #2
        ((work_completion)(&svms->deferred_list_work)){+.+.}-{0:0}:
        __flush_work+0x343/0x4a0
        svm_range_list_lock_and_flush_work+0x39/0xc0
        svm_range_set_attr+0xe8/0x1080 [amdgpu]
        kfd_ioctl+0x19b/0x600 [amdgpu]
        __x64_sys_ioctl+0x81/0xb0
        do_syscall_64+0x34/0x80
        entry_SYSCALL_64_after_hwframe+0x44/0xae

      -> #1 (&info->lock#2){+.+.}-{3:3}:
        __mutex_lock+0xa4/0x940
        amdgpu_amdkfd_gpuvm_acquire_process_vm+0x2e3/0x590
        kfd_process_device_init_vm+0x61/0x200 [amdgpu]
        kfd_ioctl_acquire_vm+0x83/0xb0 [amdgpu]
        kfd_ioctl+0x19b/0x600 [amdgpu]
        __x64_sys_ioctl+0x81/0xb0
        do_syscall_64+0x34/0x80
       entry_SYSCALL_64_after_hwframe+0x44/0xae

      -> #0 (&process->mutex){+.+.}-{3:3}:
        __lock_acquire+0x1365/0x23d0
        lock_acquire+0xc9/0x2e0
        __mutex_lock+0xa4/0x940
        kfd_process_notifier_release+0x96/0xe0 [amdgpu]
        __mmu_notifier_release+0x94/0x210
        exit_mmap+0x35/0x1f0
        mmput+0x63/0x120
        svm_range_deferred_list_work+0x177/0x2c0 [amdgpu]
        process_one_work+0x2a4/0x600
        worker_thread+0x39/0x3e0
        kthread+0x16d/0x1a0

  Possible unsafe locking scenario:

      CPU0                    CPU1
        ----                    ----
   lock((work_completion)(&svms->deferred_list_work));
                                lock(&info->lock#2);
             lock((work_completion)(&svms->deferred_list_work));
   lock(&process->mutex);

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdkfd/kfd_process.c

index d859172..6c83a51 100644 (file)
@@ -1115,6 +1115,15 @@ static void kfd_process_wq_release(struct work_struct *work)
        struct kfd_process *p = container_of(work, struct kfd_process,
                                             release_work);
 
+       kfd_process_dequeue_from_all_devices(p);
+       pqm_uninit(&p->pqm);
+
+       /* Signal the eviction fence after user mode queues are
+        * destroyed. This allows any BOs to be freed without
+        * triggering pointless evictions or waiting for fences.
+        */
+       dma_fence_signal(p->ef);
+
        kfd_process_remove_sysfs(p);
        kfd_iommu_unbind_process(p);
 
@@ -1179,20 +1188,8 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
        cancel_delayed_work_sync(&p->eviction_work);
        cancel_delayed_work_sync(&p->restore_work);
 
-       mutex_lock(&p->mutex);
-
-       kfd_process_dequeue_from_all_devices(p);
-       pqm_uninit(&p->pqm);
-
        /* Indicate to other users that MM is no longer valid */
        p->mm = NULL;
-       /* Signal the eviction fence after user mode queues are
-        * destroyed. This allows any BOs to be freed without
-        * triggering pointless evictions or waiting for fences.
-        */
-       dma_fence_signal(p->ef);
-
-       mutex_unlock(&p->mutex);
 
        mmu_notifier_put(&p->mmu_notifier);
 }