struct amdgpu_device *tmp_adev = NULL;
bool need_full_reset, skip_hw_reset, vram_lost = false;
int r = 0;
+ bool gpu_reset_for_dev_remove = 0;
/* Try reset handler method first */
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
+ gpu_reset_for_dev_remove =
+ test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
+ test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
+
/*
* ASIC reset has to be done on all XGMI hive nodes ASAP
* to allow proper links negotiation in FW (within 1 sec)
amdgpu_ras_intr_cleared();
}
+ /* Since the mode1 reset affects base ip blocks, the
+ * phase1 ip blocks need to be resumed. Otherwise there
+ * will be a BIOS signature error and the psp bootloader
+ * can't load kdb on the next amdgpu install.
+ */
+ if (gpu_reset_for_dev_remove) {
+ list_for_each_entry(tmp_adev, device_list_handle, reset_list)
+ amdgpu_device_ip_resume_phase1(tmp_adev);
+
+ goto end;
+ }
+
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
if (need_full_reset) {
/* post card */
bool need_emergency_restart = false;
bool audio_suspended = false;
int tmp_vram_lost_counter;
+ bool gpu_reset_for_dev_remove = false;
+
+ gpu_reset_for_dev_remove =
+ test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
+ test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
/*
* Special case: RAS triggered and full reset isn't supported
retry: /* Rest of adevs pre asic reset from XGMI hive. */
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+ if (gpu_reset_for_dev_remove) {
+ /* Workaroud for ASICs need to disable SMC first */
+ amdgpu_device_smu_fini_early(tmp_adev);
+ }
r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
/*TODO Should we stop ?*/
if (r) {
adev->asic_reset_res = 0;
goto retry;
}
+
+ if (!r && gpu_reset_for_dev_remove)
+ goto recover_end;
}
skip_hw_reset:
amdgpu_device_unset_mp1_state(tmp_adev);
}
+recover_end:
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
reset_list);
amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
pm_runtime_forbid(dev->dev);
}
+ if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2)) {
+ bool need_to_reset_gpu = false;
+
+ if (adev->gmc.xgmi.num_physical_nodes > 1) {
+ struct amdgpu_hive_info *hive;
+
+ hive = amdgpu_get_xgmi_hive(adev);
+ if (hive->device_remove_count == 0)
+ need_to_reset_gpu = true;
+ hive->device_remove_count++;
+ amdgpu_put_xgmi_hive(hive);
+ } else {
+ need_to_reset_gpu = true;
+ }
+
+ /* Workaround for ASICs need to reset SMU.
+ * Called only when the first device is removed.
+ */
+ if (need_to_reset_gpu) {
+ struct amdgpu_reset_context reset_context;
+
+ memset(&reset_context, 0, sizeof(reset_context));
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+ set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ set_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context.flags);
+ amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+ }
+ }
+
amdgpu_driver_unload_kms(dev);
drm_dev_unplug(dev);