drm/amdgpu: reduce reset time
author Victor Zhao <Victor.Zhao@amd.com>
Fri, 24 Jun 2022 04:00:06 +0000 (12:00 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 16 Aug 2022 22:14:31 +0000 (18:14 -0400)
In the multi-container use case, reset time is important, so skip the
ring tests and the CP halt wait during IP suspend for reset, as they
are going to fail and only add time to the reset.

v2: add a hang flag to indicate the reset comes from a job timeout,
and skip the ring test and CP halt wait in this case

v3: move hang flag to adev

Signed-off-by: Victor Zhao <Victor.Zhao@amd.com>
Acked-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c

index 66a26f2c51c130342d6d2b14d07323ecf35ebd7d..79bb6fd830949dc7caeddd5b83cfe7bcbbb43c93 100644 (file)
@@ -1065,6 +1065,7 @@ struct amdgpu_device {
        struct work_struct              reset_work;
 
        uint32_t                        amdgpu_reset_level_mask;
+       bool                            job_hang;
 };
 
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
index 222d3d7ea07678fad9cbf01e55ed6f9325dbcf9d..454a78ba60d43aa1bdd521e9e532710988d84ec9 100644 (file)
@@ -477,7 +477,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev)
                kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i],
                                           RESET_QUEUES, 0, 0);
 
-       if (adev->gfx.kiq.ring.sched.ready)
+       if (adev->gfx.kiq.ring.sched.ready && !adev->job_hang)
                r = amdgpu_ring_test_helper(kiq_ring);
        spin_unlock(&adev->gfx.kiq.ring_lock);
 
index 36f49da15800f699162c5a1609b90fb27f6e094c..8f51adf3b3292e67ea20270a23086ac46c6cc360 100644 (file)
@@ -49,6 +49,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
        }
 
        memset(&ti, 0, sizeof(struct amdgpu_task_info));
+       adev->job_hang = true;
 
        if (amdgpu_gpu_recovery &&
            amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
@@ -83,6 +84,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
        }
 
 exit:
+       adev->job_hang = false;
        drm_dev_exit(idx);
        return DRM_GPU_SCHED_STAT_NOMINAL;
 }
index a2a4dc1844c0adc0c54ab850fa026e1f9ca62aec..1a915edccb9206a9517f18fc476b32400130b140 100644 (file)
@@ -5971,6 +5971,9 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device *adev, bool enable)
                WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp);
        }
 
+       if (adev->job_hang && !enable)
+               return 0;
+
        for (i = 0; i < adev->usec_timeout; i++) {
                if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0)
                        break;
@@ -7569,8 +7572,10 @@ static int gfx_v10_0_kiq_disable_kgq(struct amdgpu_device *adev)
        for (i = 0; i < adev->gfx.num_gfx_rings; i++)
                kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.gfx_ring[i],
                                           PREEMPT_QUEUES, 0, 0);
-
-       return amdgpu_ring_test_helper(kiq_ring);
+       if (!adev->job_hang)
+               return amdgpu_ring_test_helper(kiq_ring);
+       else
+               return 0;
 }
 #endif
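
Taken together, the hunks implement a simple flag-guard pattern: the job
timeout handler brackets the recovery path with adev->job_hang = true/false,
and the disable/suspend paths consult the flag to skip work that is known to
fail while the GPU is hung. Below is a minimal, self-contained C sketch of
the same pattern; the struct, the fake_ring_test() helper, and the harness
are illustrative stand-ins, not the driver's real API — only the control
flow mirrors the patch.

/* Minimal userspace model of the job_hang flag-guard pattern from this
 * patch. All names here are hypothetical; only the control flow is real. */
#include <stdbool.h>
#include <stdio.h>

struct fake_adev {
	bool job_hang;	/* mirrors adev->job_hang from the patch */
};

/* Stand-in for amdgpu_ring_test_helper(): expensive, and guaranteed to
 * fail while the GPU is hung. */
static int fake_ring_test(void)
{
	printf("running ring test (slow)\n");
	return 0;
}

/* Stand-in for a disable path such as amdgpu_gfx_disable_kcq(): skip the
 * ring test when the reset came from a job timeout. */
static int disable_queues(struct fake_adev *adev)
{
	if (!adev->job_hang)
		return fake_ring_test();
	printf("job hang: skipping ring test\n");
	return 0;
}

/* Stand-in for amdgpu_job_timedout(): bracket the recovery path with the
 * flag, exactly as the patch does. */
static void job_timedout(struct fake_adev *adev)
{
	adev->job_hang = true;
	disable_queues(adev);	/* reset path: test skipped */
	adev->job_hang = false;
}

int main(void)
{
	struct fake_adev adev = { .job_hang = false };

	disable_queues(&adev);	/* normal path: test runs */
	job_timedout(&adev);	/* timeout path: test skipped */
	return 0;
}

Keeping the flag on adev (the v3 change) lets any IP block's suspend path
check it without threading an extra parameter through the reset call chain.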