Add new parameter to control GPU recovery procedure.
v2:
Add auto logic where reset is disabled for bare metal and enabled
for SR-IOV.
Allow forced reset from debugfs.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
extern int amdgpu_job_hang_limit;
extern int amdgpu_lbpw;
extern int amdgpu_compute_multipipe;
+extern int amdgpu_gpu_recovery;
#ifdef CONFIG_DRM_AMDGPU_SI
extern int amdgpu_si_support;
#define amdgpu_psp_check_fw_loading_status(adev, i) (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
/* Common functions */
-int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job* job);
+int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job* job, bool force);
bool amdgpu_need_backup(struct amdgpu_device *adev);
void amdgpu_pci_config_reset(struct amdgpu_device *adev);
bool amdgpu_need_post(struct amdgpu_device *adev);
*
* @adev: amdgpu device pointer
* @job: which job trigger hang
+ * @force forces reset regardless of amdgpu_gpu_recovery
*
* Attempt to reset the GPU if it has hung (all asics).
* Returns 0 for success or an error on failure.
*/
-int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job *job)
+int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job *job, bool force)
{
struct drm_atomic_state *state = NULL;
uint64_t reset_flags = 0;
return 0;
}
+ if (!force && (amdgpu_gpu_recovery == 0 ||
+ (amdgpu_gpu_recovery == -1 && !amdgpu_sriov_vf(adev)))) {
+ DRM_INFO("GPU recovery disabled.\n");
+ return 0;
+ }
+
dev_info(adev->dev, "GPU reset begin!\n");
mutex_lock(&adev->lock_reset);
int amdgpu_job_hang_limit = 0;
int amdgpu_lbpw = -1;
int amdgpu_compute_multipipe = -1;
+int amdgpu_gpu_recovery = -1; /* auto */
MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes");
module_param_named(vramlimit, amdgpu_vram_limit, int, 0600);
MODULE_PARM_DESC(compute_multipipe, "Force compute queues to be spread across pipes (1 = enable, 0 = disable, -1 = auto)");
module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);
+MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = auto");
+module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
+
#ifdef CONFIG_DRM_AMDGPU_SI
#if defined(CONFIG_DRM_RADEON) || defined(CONFIG_DRM_RADEON_MODULE)
struct amdgpu_device *adev = dev->dev_private;
seq_printf(m, "gpu recover\n");
- amdgpu_gpu_recover(adev, NULL);
+ amdgpu_gpu_recover(adev, NULL, true);
return 0;
}
reset_work);
if (!amdgpu_sriov_vf(adev))
- amdgpu_gpu_recover(adev, NULL);
+ amdgpu_gpu_recover(adev, NULL, false);
}
/* Disable *all* interrupts */
atomic_read(&job->ring->fence_drv.last_seq),
job->ring->fence_drv.sync_seq);
- amdgpu_gpu_recover(job->adev, job);
+ amdgpu_gpu_recover(job->adev, job, false);
}
int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
}
/* Trigger recovery due to world switch failure */
- amdgpu_gpu_recover(adev, NULL);
+ amdgpu_gpu_recover(adev, NULL, false);
}
static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
}
/* Trigger recovery due to world switch failure */
- amdgpu_gpu_recover(adev, NULL);
+ amdgpu_gpu_recover(adev, NULL, false);
}
static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,