drm/amdkfd: Rework kfd_locked handling
author Mukul Joshi <mukul.joshi@amd.com>
Tue, 31 May 2022 18:39:36 +0000 (14:39 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
Fri, 9 Jun 2023 13:45:35 +0000 (09:45 -0400)
Currently, even if kfd_locked is set, a process is first
created and then removed to work around a race condition
in updating the kfd_locked flag. Rework kfd_locked handling to
ensure no process is created if kfd_locked is set. This
is achieved by updating kfd_locked under kfd_processes_mutex.
With this, there is no need for kfd_locked to be an atomic
counter. Instead, it can be a regular integer.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
drivers/gpu/drm/amd/amdkfd/kfd_device.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
drivers/gpu/drm/amd/amdkfd/kfd_process.c

index 45e8da1..8b9acce 100644 (file)
@@ -146,13 +146,6 @@ static int kfd_open(struct inode *inode, struct file *filep)
        if (IS_ERR(process))
                return PTR_ERR(process);
 
-       if (kfd_is_locked()) {
-               dev_dbg(kfd_device, "kfd is locked!\n"
-                               "process %d unreferenced", process->pasid);
-               kfd_unref_process(process);
-               return -EAGAIN;
-       }
-
        /* filep now owns the reference returned by kfd_create_process */
        filep->private_data = process;
 
index df96c4c..eb2b44f 100644 (file)
@@ -42,7 +42,7 @@
  * once locked, kfd driver will stop any further GPU execution.
  * create process (open) will return -EAGAIN.
  */
-static atomic_t kfd_locked = ATOMIC_INIT(0);
+static int kfd_locked;
 
 #ifdef CONFIG_DRM_AMDGPU_CIK
 extern const struct kfd2kgd_calls gfx_v7_kfd2kgd;
@@ -880,7 +880,9 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
                        return ret;
        }
 
-       atomic_dec(&kfd_locked);
+       mutex_lock(&kfd_processes_mutex);
+       --kfd_locked;
+       mutex_unlock(&kfd_processes_mutex);
 
        for (i = 0; i < kfd->num_nodes; i++) {
                node = kfd->nodes[i];
@@ -893,21 +895,27 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
 
 bool kfd_is_locked(void)
 {
-       return  (atomic_read(&kfd_locked) > 0);
+       lockdep_assert_held(&kfd_processes_mutex);
+       return  (kfd_locked > 0);
 }
 
 void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
 {
        struct kfd_node *node;
        int i;
+       int count;
 
        if (!kfd->init_complete)
                return;
 
        /* for runtime suspend, skip locking kfd */
        if (!run_pm) {
+               mutex_lock(&kfd_processes_mutex);
+               count = ++kfd_locked;
+               mutex_unlock(&kfd_processes_mutex);
+
                /* For first KFD device suspend all the KFD processes */
-               if (atomic_inc_return(&kfd_locked) == 1)
+               if (count == 1)
                        kfd_suspend_all_processes();
        }
 
@@ -933,7 +941,10 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
 
        /* for runtime resume, skip unlocking kfd */
        if (!run_pm) {
-               count = atomic_dec_return(&kfd_locked);
+               mutex_lock(&kfd_processes_mutex);
+               count = --kfd_locked;
+               mutex_unlock(&kfd_processes_mutex);
+
                WARN_ONCE(count < 0, "KFD suspend / resume ref. error");
                if (count == 0)
                        ret = kfd_resume_all_processes();
index 5cfebcc..400b4dc 100644 (file)
@@ -201,6 +201,8 @@ extern int amdgpu_no_queue_eviction_on_vm_fault;
 /* Enable eviction debug messages */
 extern bool debug_evictions;
 
+extern struct mutex kfd_processes_mutex;
+
 enum cache_policy {
        cache_policy_coherent,
        cache_policy_noncoherent
index 9b1e84d..c3d43e6 100644 (file)
@@ -50,7 +50,7 @@ struct mm_struct;
  * Unique/indexed by mm_struct*
  */
 DEFINE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE);
-static DEFINE_MUTEX(kfd_processes_mutex);
+DEFINE_MUTEX(kfd_processes_mutex);
 
 DEFINE_SRCU(kfd_processes_srcu);
 
@@ -818,6 +818,12 @@ struct kfd_process *kfd_create_process(struct file *filep)
         */
        mutex_lock(&kfd_processes_mutex);
 
+       if (kfd_is_locked()) {
+               mutex_unlock(&kfd_processes_mutex);
+               pr_debug("KFD is locked! Cannot create process");
+               return ERR_PTR(-EINVAL);
+       }
+
        /* A prior open of /dev/kfd could have already created the process. */
        process = find_process(thread, false);
        if (process) {