drm/amdgpu: Adjust removal control flow for smu v13_0_2

author YiPeng Chai <YiPeng.Chai@amd.com>

Wed, 7 Sep 2022 08:07:42 +0000 (16:07 +0800)

committer Alex Deucher <alexander.deucher@amd.com>

Mon, 19 Sep 2022 19:17:20 +0000 (15:17 -0400)
author YiPeng Chai <YiPeng.Chai@amd.com>
Wed, 7 Sep 2022 08:07:42 +0000 (16:07 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Mon, 19 Sep 2022 19:17:20 +0000 (15:17 -0400)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 47aac17..c268bd0 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4749,6 +4749,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
         struct amdgpu_device *tmp_adev = NULL;
         bool need_full_reset, skip_hw_reset, vram_lost = false;
         int r = 0;
+       bool gpu_reset_for_dev_remove = 0;
  
         /* Try reset handler method first */
         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
@@ -4768,6 +4769,10 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
         skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
  
+       gpu_reset_for_dev_remove =
+               test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
+                       test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
+
         /*
          * ASIC reset has to be done on all XGMI hive nodes ASAP
          * to allow proper links negotiation in FW (within 1 sec)
@@ -4812,6 +4817,18 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
                 amdgpu_ras_intr_cleared();
         }
  
+       /* Since the mode1 reset affects base ip blocks, the
+        * phase1 ip blocks need to be resumed. Otherwise there
+        * will be a BIOS signature error and the psp bootloader
+        * can't load kdb on the next amdgpu install.
+        */
+       if (gpu_reset_for_dev_remove) {
+               list_for_each_entry(tmp_adev, device_list_handle, reset_list)
+                       amdgpu_device_ip_resume_phase1(tmp_adev);
+
+               goto end;
+       }
+
         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
                 if (need_full_reset) {
                         /* post card */
@@ -5134,6 +5151,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
         bool need_emergency_restart = false;
         bool audio_suspended = false;
         int tmp_vram_lost_counter;
+       bool gpu_reset_for_dev_remove = false;
+
+       gpu_reset_for_dev_remove =
+                       test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
+                               test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
  
         /*
          * Special case: RAS triggered and full reset isn't supported
@@ -5253,6 +5275,10 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
  
  retry: /* Rest of adevs pre asic reset from XGMI hive. */
         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+               if (gpu_reset_for_dev_remove) {
+                       /* Workaround for ASICs that need to disable SMC first */
+                       amdgpu_device_smu_fini_early(tmp_adev);
+               }
                 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
                 /*TODO Should we stop ?*/
                 if (r) {
@@ -5286,6 +5312,9 @@ retry:    /* Rest of adevs pre asic reset from XGMI hive. */
                         adev->asic_reset_res = 0;
                         goto retry;
                 }
+
+               if (!r && gpu_reset_for_dev_remove)
+                       goto recover_end;
         }
  
  skip_hw_reset:
@@ -5359,6 +5388,7 @@ skip_sched_resume:
                 amdgpu_device_unset_mp1_state(tmp_adev);
         }
  
+recover_end:
         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
                                             reset_list);
         amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

index 728a093..2e16210 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2186,6 +2186,36 @@ amdgpu_pci_remove(struct pci_dev *pdev)
                 pm_runtime_forbid(dev->dev);
         }
  
+       if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2)) {
+               bool need_to_reset_gpu = false;
+
+               if (adev->gmc.xgmi.num_physical_nodes > 1) {
+                       struct amdgpu_hive_info *hive;
+
+                       hive = amdgpu_get_xgmi_hive(adev);
+                       if (hive->device_remove_count == 0)
+                               need_to_reset_gpu = true;
+                       hive->device_remove_count++;
+                       amdgpu_put_xgmi_hive(hive);
+               } else {
+                       need_to_reset_gpu = true;
+               }
+
+               /* Workaround for ASICs that need to reset SMU.
+                * Called only when the first device is removed.
+                */
+               if (need_to_reset_gpu) {
+                       struct amdgpu_reset_context reset_context;
+
+                       memset(&reset_context, 0, sizeof(reset_context));
+                       reset_context.method = AMD_RESET_METHOD_NONE;
+                       reset_context.reset_req_dev = adev;
+                       set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+                       set_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context.flags);
+                       amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+               }
+       }
+
         amdgpu_driver_unload_kms(dev);
  
         drm_dev_unplug(dev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h

index f71b83c..dc43fcb 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -31,6 +31,7 @@ enum AMDGPU_RESET_FLAGS {
         AMDGPU_NEED_FULL_RESET = 0,
         AMDGPU_SKIP_HW_RESET = 1,
         AMDGPU_SKIP_MODE2_RESET = 2,
+       AMDGPU_RESET_FOR_DEVICE_REMOVE = 3,
  };
  
  struct amdgpu_reset_context {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h

index 552e6fb..30dcc16 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -43,6 +43,7 @@ struct amdgpu_hive_info {
         } pstate;
  
         struct amdgpu_reset_domain *reset_domain;
+       uint32_t device_remove_count;
  };
  
  struct amdgpu_pcs_ras_field {
author	YiPeng Chai <YiPeng.Chai@amd.com>
	Wed, 7 Sep 2022 08:07:42 +0000 (16:07 +0800)
committer	Alex Deucher <alexander.deucher@amd.com>
	Mon, 19 Sep 2022 19:17:20 +0000 (15:17 -0400)
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h		patch \| blob \| history