drm/amdgpu: fix race issue when jobs on 2 rings time out
authorHorace Chen <horace.chen@amd.com>
Wed, 20 Jan 2021 14:03:28 +0000 (22:03 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Mon, 25 Jan 2021 22:45:16 +0000 (17:45 -0500)
Fix a race issue that occurs when jobs on 2 rings time out simultaneously.

If 2 rings time out at the same time, amdgpu_device_gpu_recover()
is re-entered. The adev->gmc.xgmi.head node then gets added to 2
different local linked lists, which may leave a wild pointer behind
when one of the lists is iterated.
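
For illustration only (not part of this patch): a minimal sketch of
the hazard, using only the standard <linux/list.h> helpers. The names
below (fake_adev, fake_gpu_recover) are hypothetical stand-ins, not
driver code. An embedded list_head can only be on one list at a time,
so a concurrent list_add_tail() into a second local list rewires the
node while the first caller may still be walking its own list:

  #include <linux/list.h>

  /* Hypothetical stand-in for amdgpu_device; "head" plays the role
   * of adev->gmc.xgmi.head. */
  struct fake_adev {
          struct list_head head;
  };

  static void fake_gpu_recover(struct fake_adev *adev)
  {
          struct fake_adev *tmp;
          LIST_HEAD(device_list);  /* local list, one per reset thread */

          /*
           * list_add_tail() rewrites adev->head.prev/next.  If a second
           * thread runs this concurrently with its own local device_list,
           * the node is re-linked into that thread's stack-allocated
           * list, and iterating here can follow pointers into a stack
           * frame that may already be gone: a wild pointer.
           */
          list_add_tail(&adev->head, &device_list);

          list_for_each_entry(tmp, &device_list, head) {
                  /* ... suspend, reset and resume tmp here ... */
          }
  }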

Lock the device early to prevent the node from being added to 2
different lists.

Also increase karma for the skipped job, since that job also timed
out and should be considered guilty.

Signed-off-by: Horace Chen <horace.chen@amd.com>
Reviewed-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index e4e1f04..9ae0f2d 100644 (file)
@@ -4461,6 +4461,46 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
        up_write(&adev->reset_sem);
 }
 
+/*
+ * Safely lock a list of amdgpu devices in a hive. For a device that is
+ * not in a multi-node hive this behaves like amdgpu_device_lock_adev().
+ *
+ * Unlocking does not require a roll back.
+ */
+static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
+{
+       struct amdgpu_device *tmp_adev = NULL;
+
+       if (adev->gmc.xgmi.num_physical_nodes > 1) {
+               if (!hive) {
+                       dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
+                       return -ENODEV;
+               }
+               list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+                       if (!amdgpu_device_lock_adev(tmp_adev, hive))
+                               goto roll_back;
+               }
+       } else if (!amdgpu_device_lock_adev(adev, hive))
+               return -EAGAIN;
+
+       return 0;
+roll_back:
+       if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
+               /*
+                * If the lock iteration stopped in the middle of a hive,
+                * there may be a race, or a hive device may have locked
+                * up independently. We may or may not be in trouble,
+                * so try to roll back the locks taken so far and emit
+                * a warning.
+                */
+               dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
+               list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+                       amdgpu_device_unlock_adev(tmp_adev);
+               }
+       }
+       return -EAGAIN;
+}
+
 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
 {
        struct pci_dev *p = NULL;
@@ -4574,20 +4614,36 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                        DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
                                job ? job->base.id : -1, hive->hive_id);
                        amdgpu_put_xgmi_hive(hive);
+                       if (job)
+                               drm_sched_increase_karma(&job->base);
                        return 0;
                }
                mutex_lock(&hive->hive_lock);
        }
 
        /*
+        * Lock the device before operating on the linked list. If we
+        * failed to get the device lock, don't touch the linked list,
+        * since others may be iterating over it.
+        */
+       r = amdgpu_device_lock_hive_adev(adev, hive);
+       if (r) {
+               dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
+                                       job ? job->base.id : -1);
+
+               /* Even though we skipped this reset, still mark the job as guilty */
+               if (job)
+                       drm_sched_increase_karma(&job->base);
+               goto skip_recovery;
+       }
+
+       /*
         * Build list of devices to reset.
         * In case we are in XGMI hive mode, resort the device list
         * to put adev in the 1st position.
         */
        INIT_LIST_HEAD(&device_list);
        if (adev->gmc.xgmi.num_physical_nodes > 1) {
-               if (!hive)
-                       return -ENODEV;
                if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
                        list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
                device_list_handle = &hive->device_list;
@@ -4598,13 +4654,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
        /* block all schedulers and reset given job's ring */
        list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-               if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
-                       dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
-                                 job ? job->base.id : -1);
-                       r = 0;
-                       goto skip_recovery;
-               }
-
                /*
                 * Try to put the audio codec into suspend state
                 * before gpu reset started.
@@ -4742,7 +4791,7 @@ skip_recovery:
                amdgpu_put_xgmi_hive(hive);
        }
 
-       if (r)
+       if (r && r != -EAGAIN)
                dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
        return r;
 }
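
A side note on the roll_back path added above: with
list_for_each_entry_continue_reverse() the walk resumes from the entry
before the one that failed and goes backwards, so only the devices that
were actually locked get unlocked, and nothing happens if the very
first lock attempt failed. A minimal, hypothetical sketch of this
lock-all-or-roll-back pattern (fake_node, try_lock_node() and
unlock_node() are made-up placeholders, not driver code):

  #include <linux/list.h>

  struct fake_node {
          struct list_head head;
  };

  /* Placeholder lock helpers; pretend locking always succeeds. */
  static bool try_lock_node(struct fake_node *node)
  {
          return true;
  }

  static void unlock_node(struct fake_node *node)
  {
  }

  static int lock_all_or_roll_back(struct list_head *list)
  {
          struct fake_node *node;

          list_for_each_entry(node, list, head) {
                  if (!try_lock_node(node))
                          goto roll_back;
          }
          return 0;

  roll_back:
          /*
           * Continue backwards from the entry *before* the one that
           * failed, releasing only the nodes locked so far.  If the
           * very first node failed, the loop body never runs.
           */
          list_for_each_entry_continue_reverse(node, list, head)
                  unlock_node(node);
          return -EAGAIN;
  }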