drm/amdgpu: Rework amdgpu_device_lock_adev
author	Andrey Grodzovsky <andrey.grodzovsky@amd.com>
	Tue, 25 Jan 2022 16:32:47 +0000 (11:32 -0500)
committer	Andrey Grodzovsky <andrey.grodzovsky@amd.com>
	Wed, 9 Feb 2022 17:18:39 +0000 (12:18 -0500)
This function needs to be split into two parts: one that is called
only once per reset to lock the single instance of the reset_domain's
sem and reset flag, and another that handles the MP1 states and is
still called for each device in the XGMI hive.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Link: https://www.spinics.net/lists/amd-gfx/msg74118.html
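A rough sketch of the call pattern this split produces in amdgpu_device_gpu_recover_imp(), condensed from the diff below (the actual reset sequence, audio handling and error paths are omitted; all identifiers are taken from the patch itself):

	struct amdgpu_device *tmp_adev;

	/* Lock the shared reset_domain exactly once, whether we reset a
	 * single device or a whole XGMI hive. */
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_device_lock_reset_domain(tmp_adev->reset_domain, hive);

	/* MP1 state handling is still done per device. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list)
		amdgpu_device_set_mp1_state(tmp_adev);

	/* ... perform the reset and resume the schedulers ... */

	list_for_each_entry(tmp_adev, device_list_handle, reset_list)
		amdgpu_device_unset_mp1_state(tmp_adev);

	/* Re-fetch the first entry; tmp_adev is not a valid device
	 * pointer once the loop above has completed. */
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);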
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e05d7cb..f69ab22 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4825,16 +4825,8 @@ end:
        return r;
 }
 
-static void amdgpu_device_lock_adev(struct amdgpu_device *adev,
-                               struct amdgpu_hive_info *hive)
+static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
 {
-       atomic_set(&adev->reset_domain->in_gpu_reset, 1);
-
-       if (hive) {
-               down_write_nest_lock(&adev->reset_domain->sem, &hive->hive_lock);
-       } else {
-               down_write(&adev->reset_domain->sem);
-       }
 
        switch (amdgpu_asic_reset_method(adev)) {
        case AMD_RESET_METHOD_MODE1:
@@ -4849,12 +4841,10 @@ static void amdgpu_device_lock_adev(struct amdgpu_device *adev,
        }
 }
 
-static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
+static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
 {
        amdgpu_vf_error_trans_all(adev);
        adev->mp1_state = PP_MP1_STATE_NONE;
-       atomic_set(&adev->reset_domain->in_gpu_reset, 0);
-       up_write(&adev->reset_domain->sem);
 }
 
 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
@@ -5060,10 +5050,15 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
                device_list_handle = &device_list;
        }
 
+       /* We need to lock reset domain only once both for XGMI and single device */
+       tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
+                                   reset_list);
+       amdgpu_device_lock_reset_domain(tmp_adev->reset_domain, hive);
+
        /* block all schedulers and reset given job's ring */
        list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 
-               amdgpu_device_lock_adev(tmp_adev, hive);
+               amdgpu_device_set_mp1_state(tmp_adev);
 
                /*
                 * Try to put the audio codec into suspend state
@@ -5213,9 +5208,14 @@ skip_sched_resume:
 
                if (audio_suspended)
                        amdgpu_device_resume_display_audio(tmp_adev);
-               amdgpu_device_unlock_adev(tmp_adev);
+
+               amdgpu_device_unset_mp1_state(tmp_adev);
        }
 
+       tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
+                                           reset_list);
+       amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
+
        if (hive) {
                mutex_unlock(&hive->hive_lock);
                amdgpu_put_xgmi_hive(hive);
@@ -5477,7 +5477,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
                 * Locking adev->reset_domain->sem will prevent any external access
                 * to GPU during PCI error recovery
                 */
-               amdgpu_device_lock_adev(adev, NULL);
+               amdgpu_device_lock_reset_domain(adev->reset_domain, NULL);
+               amdgpu_device_set_mp1_state(adev);
 
                /*
                 * Block any work scheduling as we do for regular GPU reset
@@ -5584,7 +5585,8 @@ out:
                DRM_INFO("PCIe error recovery succeeded\n");
        } else {
                DRM_ERROR("PCIe error recovery failed, err:%d", r);
-               amdgpu_device_unlock_adev(adev);
+               amdgpu_device_unset_mp1_state(adev);
+               amdgpu_device_unlock_reset_domain(adev->reset_domain);
        }
 
        return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
@@ -5621,7 +5623,8 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
                drm_sched_start(&ring->sched, true);
        }
 
-       amdgpu_device_unlock_adev(adev);
+       amdgpu_device_unset_mp1_state(adev);
+       amdgpu_device_unlock_reset_domain(adev->reset_domain);
 }
 
 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 5ab72c3..9b18ad0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -137,5 +137,24 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
        return reset_domain;
 }
 
+void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain,
+                                    struct amdgpu_hive_info *hive)
+{
+       atomic_set(&reset_domain->in_gpu_reset, 1);
+
+       if (hive) {
+               down_write_nest_lock(&reset_domain->sem, &hive->hive_lock);
+       } else {
+               down_write(&reset_domain->sem);
+       }
+}
+
+
+void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain)
+{
+       atomic_set(&reset_domain->in_gpu_reset, 0);
+       up_write(&reset_domain->sem);
+}
+
 
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index ea6fc98..92de3b7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -118,5 +118,9 @@ static inline bool amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *doma
        return queue_work(domain->wq, work);
 }
 
+void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain,
+                                    struct amdgpu_hive_info *hive);
+
+void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain);
 
 #endif