drm/amdgpu: support reset flag set for gpu reset

author Likun Gao <Likun.Gao@amd.com>
Fri, 8 Jul 2022 03:14:05 +0000 (11:14 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Wed, 13 Jul 2022 15:25:17 +0000 (11:25 -0400)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

index fb9399a..2871a3e 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1253,9 +1253,8 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
  bool amdgpu_device_has_job_running(struct amdgpu_device *adev);
  bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev);
  int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
-                             struct amdgpu_job* job);
-int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
-                             struct amdgpu_job *job);
+                             struct amdgpu_job *job,
+                             struct amdgpu_reset_context *reset_context);
  void amdgpu_device_pci_config_reset(struct amdgpu_device *adev);
  int amdgpu_device_pci_reset(struct amdgpu_device *adev);
  bool amdgpu_device_need_post(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c

index 5675974..5e53a52 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -129,7 +129,14 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work)
         struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
                                                   kfd.reset_work);
  
-       amdgpu_device_gpu_recover(adev, NULL);
+       struct amdgpu_reset_context reset_context;
+       memset(&reset_context, 0, sizeof(reset_context));
+
+       reset_context.method = AMD_RESET_METHOD_NONE;
+       reset_context.reset_req_dev = adev;
+       clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+
+       amdgpu_device_gpu_recover(adev, NULL, &reset_context);
  }
  
  void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 64f3771..e1c9587 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5109,7 +5109,8 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
   */
  
  int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
-                             struct amdgpu_job *job)
+                             struct amdgpu_job *job,
+                             struct amdgpu_reset_context *reset_context)
  {
         struct list_head device_list, *device_list_handle =  NULL;
         bool job_signaled = false;
@@ -5119,9 +5120,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
         bool need_emergency_restart = false;
         bool audio_suspended = false;
         int tmp_vram_lost_counter;
-       struct amdgpu_reset_context reset_context;
-
-       memset(&reset_context, 0, sizeof(reset_context));
  
         /*
          * Special case: RAS triggered and full reset isn't supported
@@ -5147,12 +5145,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
         if (hive)
                 mutex_lock(&hive->hive_lock);
  
-       reset_context.method = AMD_RESET_METHOD_NONE;
-       reset_context.reset_req_dev = adev;
-       reset_context.job = job;
-       reset_context.hive = hive;
-       clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-
+       reset_context->job = job;
+       reset_context->hive = hive;
         /*
          * Build list of devices to reset.
          * In case we are in XGMI hive mode, resort the device list
@@ -5245,7 +5239,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
  
  retry: /* Rest of adevs pre asic reset from XGMI hive. */
         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
-               r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);
+               r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
                 /*TODO Should we stop ?*/
                 if (r) {
                         dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
@@ -5272,7 +5266,7 @@ retry:    /* Rest of adevs pre asic reset from XGMI hive. */
                 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
                         amdgpu_ras_resume(adev);
         } else {
-               r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
+               r = amdgpu_do_asic_reset(device_list_handle, reset_context);
                 if (r && r == -EAGAIN)
                         goto retry;
         }
@@ -5292,7 +5286,7 @@ skip_hw_reset:
                 if (amdgpu_gpu_recovery == 2 &&
                         !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
                         amdgpu_device_recheck_guilty_jobs(
-                               tmp_adev, device_list_handle, &reset_context);
+                               tmp_adev, device_list_handle, reset_context);
  
                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                         struct amdgpu_ring *ring = tmp_adev->rings[i];
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c

index 39597ab..ff659d4 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -844,7 +844,14 @@ static void amdgpu_debugfs_reset_work(struct work_struct *work)
         struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
                                                   reset_work);
  
-       amdgpu_device_gpu_recover(adev, NULL);
+       struct amdgpu_reset_context reset_context;
+       memset(&reset_context, 0, sizeof(reset_context));
+
+       reset_context.method = AMD_RESET_METHOD_NONE;
+       reset_context.reset_req_dev = adev;
+       set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+
+       amdgpu_device_gpu_recover(adev, NULL, &reset_context);
  }
  
  #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

index 2273579..36c1be7 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -29,6 +29,7 @@
  
  #include "amdgpu.h"
  #include "amdgpu_trace.h"
+#include "amdgpu_reset.h"
  
  static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
  {
@@ -64,7 +65,14 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
                   ti.process_name, ti.tgid, ti.task_name, ti.pid);
  
         if (amdgpu_device_should_recover_gpu(ring->adev)) {
-               r = amdgpu_device_gpu_recover(ring->adev, job);
+               struct amdgpu_reset_context reset_context;
+               memset(&reset_context, 0, sizeof(reset_context));
+
+               reset_context.method = AMD_RESET_METHOD_NONE;
+               reset_context.reset_req_dev = adev;
+               clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+
+               r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
                 if (r)
                         DRM_ERROR("GPU Recovery Failed: %d\n", r);
         } else {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

index a5b6db5..ff5361f 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1942,8 +1942,16 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
                 amdgpu_put_xgmi_hive(hive);
         }
  
-       if (amdgpu_device_should_recover_gpu(ras->adev))
-               amdgpu_device_gpu_recover(ras->adev, NULL);
+       if (amdgpu_device_should_recover_gpu(ras->adev)) {
+               struct amdgpu_reset_context reset_context;
+               memset(&reset_context, 0, sizeof(reset_context));
+
+               reset_context.method = AMD_RESET_METHOD_NONE;
+               reset_context.reset_req_dev = adev;
+               clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+
+               amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
+       }
         atomic_set(&ras->in_recovery, 0);
  }
  
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c

index 7ec5b5c..12906ba 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -283,8 +283,16 @@ flr_done:
         /* Trigger recovery for world switch failure if no TDR */
         if (amdgpu_device_should_recover_gpu(adev)
                 && (!amdgpu_device_has_job_running(adev) ||
-               adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
-               amdgpu_device_gpu_recover(adev, NULL);
+                       adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT)) {
+               struct amdgpu_reset_context reset_context;
+               memset(&reset_context, 0, sizeof(reset_context));
+
+               reset_context.method = AMD_RESET_METHOD_NONE;
+               reset_context.reset_req_dev = adev;
+               clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+
+               amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+       }
  }
  
  static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c

index e18b75c..e07757e 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -310,8 +310,16 @@ flr_done:
                 adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
                 adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
                 adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
-               adev->video_timeout == MAX_SCHEDULE_TIMEOUT))
-               amdgpu_device_gpu_recover(adev, NULL);
+               adev->video_timeout == MAX_SCHEDULE_TIMEOUT)) {
+               struct amdgpu_reset_context reset_context;
+               memset(&reset_context, 0, sizeof(reset_context));
+
+               reset_context.method = AMD_RESET_METHOD_NONE;
+               reset_context.reset_req_dev = adev;
+               clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+
+               amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+       }
  }
  
  static int xgpu_nv_set_mailbox_rcv_irq(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c

index c5016a9..288c414 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -522,8 +522,16 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
         }
  
         /* Trigger recovery due to world switch failure */
-       if (amdgpu_device_should_recover_gpu(adev))
-               amdgpu_device_gpu_recover(adev, NULL);
+       if (amdgpu_device_should_recover_gpu(adev)) {
+               struct amdgpu_reset_context reset_context;
+               memset(&reset_context, 0, sizeof(reset_context));
+
+               reset_context.method = AMD_RESET_METHOD_NONE;
+               reset_context.reset_req_dev = adev;
+               clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+
+               amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+       }
  }
  
  static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,
author	Likun Gao <Likun.Gao@amd.com>
	Fri, 8 Jul 2022 03:14:05 +0000 (11:14 +0800)
committer	Alex Deucher <alexander.deucher@amd.com>
	Wed, 13 Jul 2022 15:25:17 +0000 (11:25 -0400)
drivers/gpu/drm/amd/amdgpu/amdgpu.h		patch | blob | history
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c		patch | blob | history
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c		patch | blob | history
drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c		patch | blob | history
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c		patch | blob | history
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c		patch | blob | history
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c		patch | blob | history
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c		patch | blob | history
drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c		patch | blob | history