From bfa84da6185cb1897fcee0ac3815625d162d39f0 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Fri, 17 Mar 2023 17:13:46 +0800 Subject: [PATCH] drm/amdgpu: add RAS error count query for gfx_v9_4_3 Query GFX RAS ce/ue count. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 56 +++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index 188b4d9..bfd041b 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -3724,6 +3724,55 @@ static const struct soc15_reg_entry gfx_v9_4_3_ea_err_status_regs = { SOC15_REG_ENTRY(GC, 0, regGCEA_ERR_STATUS), 0, 1, 16 }; +static void gfx_v9_4_3_inst_query_ras_err_count(struct amdgpu_device *adev, + void *ras_error_status, int xcc_id) +{ + struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; + unsigned long ce_count = 0, ue_count = 0; + uint32_t i, j, k; + + mutex_lock(&adev->grbm_idx_mutex); + + for (i = 0; i < ARRAY_SIZE(gfx_v9_4_3_ce_reg_list); i++) { + for (j = 0; j < gfx_v9_4_3_ce_reg_list[i].se_num; j++) { + for (k = 0; k < gfx_v9_4_3_ce_reg_list[i].reg_entry.reg_inst; k++) { + /* no need to select if instance number is 1 */ + if (gfx_v9_4_3_ce_reg_list[i].se_num > 1 || + gfx_v9_4_3_ce_reg_list[i].reg_entry.reg_inst > 1) + gfx_v9_4_3_xcc_select_se_sh(adev, j, 0, k, xcc_id); + + amdgpu_ras_inst_query_ras_error_count(adev, + &(gfx_v9_4_3_ce_reg_list[i].reg_entry), + 1, + gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ce_reg_list[i].mem_id_type].mem_id_ent, + gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ce_reg_list[i].mem_id_type].size, + GET_INST(GC, xcc_id), + AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, + &ce_count); + + amdgpu_ras_inst_query_ras_error_count(adev, + &(gfx_v9_4_3_ue_reg_list[i].reg_entry), + 1, + gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ue_reg_list[i].mem_id_type].mem_id_ent, + gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ue_reg_list[i].mem_id_type].size, + GET_INST(GC, xcc_id), + AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, + &ue_count); + } + } + } + + gfx_v9_4_3_xcc_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, + xcc_id); + mutex_unlock(&adev->grbm_idx_mutex); + + /* the caller should make sure initialize value of + * err_data->ue_count and err_data->ce_count + */ + err_data->ce_count += ce_count; + err_data->ue_count += ue_count; +} + static void gfx_v9_4_3_inst_query_ea_err_status(struct amdgpu_device *adev, int xcc_id) { @@ -3826,6 +3875,13 @@ static void gfx_v9_4_3_inst_reset_ras_err_status(struct amdgpu_device *adev, gfx_v9_4_3_inst_reset_ea_err_status(adev, xcc_id); } +static void gfx_v9_4_3_query_ras_error_count(struct amdgpu_device *adev, + void *ras_error_status) +{ + amdgpu_gfx_ras_error_func(adev, ras_error_status, + gfx_v9_4_3_inst_query_ras_err_count); +} + static void gfx_v9_4_3_query_ras_error_status(struct amdgpu_device *adev) { amdgpu_gfx_ras_error_func(adev, NULL, gfx_v9_4_3_inst_query_ras_err_status); -- 2.7.4