From 322a7e005db78b8a46ead91b7e3df3514cb658f0 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Thu, 2 Feb 2023 20:54:08 +0800 Subject: [PATCH] drm/amdgpu: Add common helper to query ras error (v2) Add common helper to query ras error status and log error information, including memory block id and erorr count. The helpers are applicable to IP blocks that follow the new ras error logging design. For IP blocks that don't support the new design, please still implement ip specific helper to query ras error. v2: optimize struct amdgpu_ras_err_status_reg_entry and the implementaion in helper (Lijo/Tao) Signed-off-by: Hawking Zhang Reviewed-by: Tao Zhou Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 119 ++++++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 54 +++++++++++++++ 2 files changed, 173 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 22f401f..57e86af 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3103,3 +3103,122 @@ int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, return 0; } + +void amdgpu_ras_get_error_type_name(uint32_t err_type, char *err_type_name) +{ + if (!err_type_name) + return; + + switch (err_type) { + case AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE: + sprintf(err_type_name, "correctable"); + break; + case AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE: + sprintf(err_type_name, "uncorrectable"); + break; + default: + sprintf(err_type_name, "unknown"); + break; + } +} + +bool amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device *adev, + const struct amdgpu_ras_err_status_reg_entry *reg_entry, + uint32_t instance, + uint32_t *memory_id) +{ + uint32_t err_status_lo_data, err_status_lo_offset; + + if (!reg_entry) + return false; + + err_status_lo_offset = + AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance, + reg_entry->seg_lo, reg_entry->reg_lo); + err_status_lo_data = RREG32(err_status_lo_offset); + + if ((reg_entry->flags & AMDGPU_RAS_ERR_STATUS_VALID) && + !REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, ERR_STATUS_VALID_FLAG)) + return false; + + *memory_id = REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, MEMORY_ID); + + return true; +} + +bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev, + const struct amdgpu_ras_err_status_reg_entry *reg_entry, + uint32_t instance, + unsigned long *err_cnt) +{ + uint32_t err_status_hi_data, err_status_hi_offset; + + if (!reg_entry) + return false; + + err_status_hi_offset = + AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance, + reg_entry->seg_hi, reg_entry->reg_hi); + err_status_hi_data = RREG32(err_status_hi_offset); + + if ((reg_entry->flags & AMDGPU_RAS_ERR_INFO_VALID) && + !REG_GET_FIELD(err_status_hi_data, ERR_STATUS_HI, ERR_INFO_VALID_FLAG)) + return false; + + /* read err count */ + *err_cnt = REG_GET_FIELD(err_status_hi_data, ERR_STATUS, ERR_CNT); + + return true; +} + +void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev, + const struct amdgpu_ras_err_status_reg_entry *reg_list, + uint32_t reg_list_size, + const struct amdgpu_ras_memory_id_entry *mem_list, + uint32_t mem_list_size, + uint32_t instance, + uint32_t err_type, + unsigned long *err_count) +{ + uint32_t memory_id; + unsigned long err_cnt; + char err_type_name[16]; + uint32_t i, j; + + for (i = 0; i < reg_list_size; i++) { + /* query err_cnt from err_status_hi */ + if (!amdgpu_ras_inst_get_err_cnt_field(adev, ®_list[i], + instance, &err_cnt) || + !err_cnt) + continue; + + /* query memory_id from err_status_lo */ + if (!amdgpu_ras_inst_get_memory_id_field(adev, ®_list[i], + instance, &memory_id)) + continue; + + *err_count += err_cnt; + + /* log the errors */ + amdgpu_ras_get_error_type_name(err_type, err_type_name); + if (!mem_list) { + /* memory_list is not supported */ + dev_info(adev->dev, + "%ld %s hardware errors detected in %s, instance: %d, memory_id: %d\n", + err_cnt, err_type_name, + reg_list[i].block_name, + instance, memory_id); + } else { + for (j = 0; j < mem_list_size; j++) { + if (memory_id == mem_list[j].memory_id) { + dev_info(adev->dev, + "%ld %s hardware errors detected in %s, instance: %d, memory block: %s\n", + err_cnt, err_type_name, + reg_list[i].block_name, + instance, mem_list[j].name); + break; + } + } + } + } +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 17b3d19..c820af7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -314,6 +314,43 @@ enum amdgpu_ras_ret { AMDGPU_RAS_PT, }; +/* ras error status reisger fields */ +#define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG__SHIFT 0x0 +#define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG_MASK 0x00000001L +#define ERR_STATUS_LO__MEMORY_ID__SHIFT 0x18 +#define ERR_STATUS_LO__MEMORY_ID_MASK 0xFF000000L +#define ERR_STATUS_HI__ERR_INFO_VALID_FLAG__SHIFT 0x2 +#define ERR_STATUS_HI__ERR_INFO_VALID_FLAG_MASK 0x00000004L +#define ERR_STATUS__ERR_CNT__SHIFT 0x17 +#define ERR_STATUS__ERR_CNT_MASK 0x03800000L + +#define AMDGPU_RAS_REG_ENTRY(ip, inst, reg_lo, reg_hi) \ + ip##_HWIP, inst, reg_lo##_BASE_IDX, reg_lo, reg_hi##_BASE_IDX, reg_hi + +#define AMDGPU_RAS_REG_ENTRY_OFFSET(hwip, ip_inst, segment, reg) \ + (adev->reg_offset[hwip][ip_inst][segment] + (reg)) + +#define AMDGPU_RAS_ERR_INFO_VALID (1 << 0) +#define AMDGPU_RAS_ERR_STATUS_VALID (1 << 1) +#define AMDGPU_RAS_ERR_ADDRESS_VALID (1 << 2) + +struct amdgpu_ras_err_status_reg_entry { + uint32_t hwip; + uint32_t ip_inst; + uint32_t seg_lo; + uint32_t reg_lo; + uint32_t seg_hi; + uint32_t reg_hi; + uint32_t reg_inst; + uint32_t flags; + const char *block_name; +}; + +struct amdgpu_ras_memory_id_entry { + uint32_t memory_id; + const char *name; +}; + struct ras_common_if { enum amdgpu_ras_block block; enum amdgpu_ras_error_type type; @@ -696,4 +733,21 @@ int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_co int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, struct amdgpu_ras_block_object *ras_block_obj); void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev); +void amdgpu_ras_get_error_type_name(uint32_t err_type, char *err_type_name); +bool amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device *adev, + const struct amdgpu_ras_err_status_reg_entry *reg_entry, + uint32_t instance, + uint32_t *memory_id); +bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev, + const struct amdgpu_ras_err_status_reg_entry *reg_entry, + uint32_t instance, + unsigned long *err_cnt); +void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev, + const struct amdgpu_ras_err_status_reg_entry *reg_list, + uint32_t reg_list_size, + const struct amdgpu_ras_memory_id_entry *mem_list, + uint32_t mem_list_size, + uint32_t instance, + uint32_t err_type, + unsigned long *err_count); #endif -- 2.7.4