drm/amdgpu: add gpu reset control for umc page retirement
author Tao Zhou <tao.zhou1@amd.com>
Fri, 10 Dec 2021 09:01:15 +0000 (17:01 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 28 Dec 2021 21:02:32 +0000 (16:02 -0500)
Add a reset parameter for umc page retirement, to let the caller decide
whether to trigger a gpu reset as part of umc page retirement.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h

index 6e4bea0..0c33f36 100644 (file)
 
 #include "amdgpu_ras.h"
 
+static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
+               void *ras_error_status,
+               struct amdgpu_iv_entry *entry)
+{ /* Three-argument wrapper: forwards all arguments unchanged to amdgpu_umc_do_page_retirement(). */
+       return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true); /* reset == true: the amdgpu_ras_reset_gpu() call gated by "if (reset)" stays enabled on this path */
+}
+
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
 {
        int r;
@@ -88,9 +95,10 @@ void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
        }
 }
 
-int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
+int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
                void *ras_error_status,
-               struct amdgpu_iv_entry *entry)
+               struct amdgpu_iv_entry *entry,
+               bool reset)
 {
        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -164,7 +172,8 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
                                adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs);
                }
 
-               amdgpu_ras_reset_gpu(adev);
+               if (reset)
+                       amdgpu_ras_reset_gpu(adev);
        }
 
        kfree(err_data->err_addr);
index 9e40bad..8d18d51 100644 (file)
@@ -78,9 +78,10 @@ struct amdgpu_umc {
 
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev);
 void amdgpu_umc_ras_fini(struct amdgpu_device *adev);
-int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
+int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
                void *ras_error_status,
-               struct amdgpu_iv_entry *entry);
+               struct amdgpu_iv_entry *entry,
+               bool reset);
 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
                struct amdgpu_irq_src *source,
                struct amdgpu_iv_entry *entry);