drm/amd/amdgpu: move gpu_reset_counter increment after drm_sched_stop
author: Jingwen Chen <Jingwen.Chen2@amd.com>
Thu, 25 Feb 2021 09:08:02 +0000 (17:08 +0800)
committer: Alex Deucher <alexander.deucher@amd.com>
Fri, 26 Feb 2021 22:19:21 +0000 (17:19 -0500)
Move the gpu_reset_counter increment after drm_sched_stop() to avoid a
race condition caused by a job being submitted between the
gpu_reset_counter increment and the call to drm_sched_stop().

Signed-off-by: Jingwen Chen <Jingwen.Chen2@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 6447cd6..a0af650 100644 (file)
@@ -4449,7 +4449,6 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
                down_write(&adev->reset_sem);
        }
 
-       atomic_inc(&adev->gpu_reset_counter);
        switch (amdgpu_asic_reset_method(adev)) {
        case AMD_RESET_METHOD_MODE1:
                adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
@@ -4710,6 +4709,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                        if (need_emergency_restart)
                                amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
                }
+               atomic_inc(&tmp_adev->gpu_reset_counter);
        }
 
        if (need_emergency_restart)
@@ -5052,6 +5052,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
 
                        drm_sched_stop(&ring->sched, NULL);
                }
+               atomic_inc(&adev->gpu_reset_counter);
                return PCI_ERS_RESULT_NEED_RESET;
        case pci_channel_io_perm_failure:
                /* Permanent error, prepare for device removal */