drm/amdgpu: fix race issue when jobs on 2 rings time out
authorHorace Chen <horace.chen@amd.com>
Wed, 20 Jan 2021 14:03:28 +0000 (22:03 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Mon, 25 Jan 2021 22:45:16 +0000 (17:45 -0500)
Fix a race issue that occurs when jobs on 2 rings time out simultaneously.

If 2 rings time out at the same time, amdgpu_device_gpu_recover()
is re-entered. The adev->gmc.xgmi.head node then gets added to 2
different local linked lists, which may leave a wild pointer behind
when one of the lists is iterated.
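
For illustration only (not part of this patch): a minimal sketch of
the hazard, using only the standard <linux/list.h> helpers. The names
below (fake_adev, fake_gpu_recover) are hypothetical stand-ins, not
driver code. An embedded list_head can only be on one list at a time,
so a concurrent list_add_tail() into a second local list rewires the
node while the first caller may still be walking its own list:

  #include <linux/list.h>

  /* Hypothetical stand-in for amdgpu_device; "head" plays the role
   * of adev->gmc.xgmi.head. */
  struct fake_adev {
          struct list_head head;
  };

  static void fake_gpu_recover(struct fake_adev *adev)
  {
          struct fake_adev *tmp;
          LIST_HEAD(device_list);  /* local list, one per reset thread */

          /*
           * list_add_tail() rewrites adev->head.prev/next.  If a second
           * thread runs this concurrently with its own local device_list,
           * the node is re-linked into that thread's stack-allocated
           * list, and iterating here can follow pointers into a stack
           * frame that may already be gone: a wild pointer.
           */
          list_add_tail(&adev->head, &device_list);

          list_for_each_entry(tmp, &device_list, head) {
                  /* ... suspend, reset and resume tmp here ... */
          }
  }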

Lock the device early to prevent the node from being added to 2
different lists.

Also increase karma for the skipped job, since that job also timed
out and should be considered guilty.

Signed-off-by: Horace Chen <horace.chen@amd.com>
Reviewed-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index e4e1f04..9ae0f2d 100644 (file)
@@ -4461,6 +4461,46 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
        up_write(&adev->reset_sem);
 }
 
+/*
+ * Safely lock a list of amdgpu devices in a hive. For a device that is
+ * not in a multi-node hive this behaves like amdgpu_device_lock_adev().
+ *
+ * Unlocking does not require a roll back.
+ */
+static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
+{
+       struct amdgpu_device *tmp_adev = NULL;
+
+       if (adev->gmc.xgmi.num_physical_nodes > 1) {
+               if (!hive) {
+                       dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
+                       return -ENODEV;
+               }
+               list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+                       if (!amdgpu_device_lock_adev(tmp_adev, hive))
+                               goto roll_back;
+               }
+       } else if (!amdgpu_device_lock_adev(adev, hive))
+               return -EAGAIN;
+
+       return 0;
+roll_back:
+       if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
+               /*
+                * If the lock iteration stopped in the middle of a hive,
+                * there may be a race, or a hive device may have locked
+                * up independently. We may or may not be in trouble,
+                * so try to roll back the locks taken so far and emit
+                * a warning.
+                */
+               dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
+               list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+                       amdgpu_device_unlock_adev(tmp_adev);
+               }
+       }
+       return -EAGAIN;
+}
+
 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
 {
        struct pci_dev *p = NULL;
@@ -4574,20 +4614,36 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                        DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
                                job ? job->base.id : -1, hive->hive_id);
                        amdgpu_put_xgmi_hive(hive);
+                       if (job)
+                               drm_sched_increase_karma(&job->base);
                        return 0;
                }
                mutex_lock(&hive->hive_lock);
        }
 
        /*
+        * Lock the device before operating on the linked list. If we
+        * failed to get the device lock, don't touch the linked list,
+        * since others may be iterating over it.
+        */
+       r = amdgpu_device_lock_hive_adev(adev, hive);
+       if (r) {
+               dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
+                                       job ? job->base.id : -1);
+
+               /* Even though we skipped this reset, still mark the job as guilty */
+               if (job)
+                       drm_sched_increase_karma(&job->base);
+               goto skip_recovery;
+       }
+
+       /*
         * Build list of devices to reset.
         * In case we are in XGMI hive mode, resort the device list
         * to put adev in the 1st position.
         */
        INIT_LIST_HEAD(&device_list);
        if (adev->gmc.xgmi.num_physical_nodes > 1) {
-               if (!hive)
-                       return -ENODEV;
                if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
                        list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
                device_list_handle = &hive->device_list;
@@ -4598,13 +4654,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
        /* block all schedulers and reset given job's ring */
        list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-               if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
-                       dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
-                                 job ? job->base.id : -1);
-                       r = 0;
-                       goto skip_recovery;
-               }
-
                /*
                 * Try to put the audio codec into suspend state
                 * before gpu reset started.
@@ -4742,7 +4791,7 @@ skip_recovery:
                amdgpu_put_xgmi_hive(hive);
        }
 
-       if (r)
+       if (r && r != -EAGAIN)
                dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
        return r;
 }
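
A side note on the roll_back path added above: with
list_for_each_entry_continue_reverse() the walk resumes from the entry
before the one that failed and goes backwards, so only the devices that
were actually locked get unlocked, and nothing happens if the very
first lock attempt failed. A minimal, hypothetical sketch of this
lock-all-or-roll-back pattern (fake_node, try_lock_node() and
unlock_node() are made-up placeholders, not driver code):

  #include <linux/list.h>

  struct fake_node {
          struct list_head head;
  };

  /* Placeholder lock helpers; pretend locking always succeeds. */
  static bool try_lock_node(struct fake_node *node)
  {
          return true;
  }

  static void unlock_node(struct fake_node *node)
  {
  }

  static int lock_all_or_roll_back(struct list_head *list)
  {
          struct fake_node *node;

          list_for_each_entry(node, list, head) {
                  if (!try_lock_node(node))
                          goto roll_back;
          }
          return 0;

  roll_back:
          /*
           * Continue backwards from the entry *before* the one that
           * failed, releasing only the nodes locked so far.  If the
           * very first node failed, the loop body never runs.
           */
          list_for_each_entry_continue_reverse(node, list, head)
                  unlock_node(node);
          return -EAGAIN;
  }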