drm/amdgpu: Add common helper to query ras error (v2)
author Hawking Zhang <Hawking.Zhang@amd.com>
Thu, 2 Feb 2023 12:54:08 +0000 (20:54 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Fri, 9 Jun 2023 13:52:50 +0000 (09:52 -0400)
Add common helper to query ras error status and
log error information, including memory block id
and error count. The helpers are applicable to IP
blocks that follow the new ras error logging design.
For IP blocks that don't support the new design,
please still implement ip specific helper to query
ras error.

v2: optimize struct amdgpu_ras_err_status_reg_entry
and the implementation in helper (Lijo/Tao)

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

index 22f401f..57e86af 100644 (file)
@@ -3103,3 +3103,122 @@ int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
 
        return 0;
 }
+
+/**
+ * amdgpu_ras_get_error_type_name - translate a ras error type into text
+ * @err_type: error type, compared against the AMDGPU_RAS_ERROR__* values
+ * @err_type_name: destination buffer; nothing is written when it is NULL
+ *
+ * Writes "correctable", "uncorrectable" or "unknown" into @err_type_name.
+ * The copy is unbounded, so the caller must supply a buffer that can hold
+ * the longest name plus the terminating NUL (14 bytes).
+ */
+void amdgpu_ras_get_error_type_name(uint32_t err_type, char *err_type_name)
+{
+       const char *name;
+
+       if (!err_type_name)
+               return;
+
+       if (err_type == AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE)
+               name = "correctable";
+       else if (err_type == AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE)
+               name = "uncorrectable";
+       else
+               name = "unknown";
+
+       sprintf(err_type_name, "%s", name);
+}
+
+/**
+ * amdgpu_ras_inst_get_memory_id_field - read MEMORY_ID from err_status_lo
+ * @adev: amdgpu device; referenced implicitly by the register macros
+ * @reg_entry: error status register entry describing the lo register
+ * @instance: instance index used when computing the register offset
+ * @memory_id: output, written only when the function returns true
+ *
+ * Returns false when @reg_entry is NULL, or when the entry has
+ * AMDGPU_RAS_ERR_STATUS_VALID set in its flags and the register's
+ * ERR_STATUS_VALID_FLAG field reads zero. Returns true otherwise.
+ */
+bool amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device *adev,
+                                        const struct amdgpu_ras_err_status_reg_entry *reg_entry,
+                                        uint32_t instance,
+                                        uint32_t *memory_id)
+{
+       uint32_t offset, status_lo;
+       bool need_valid;
+
+       if (!reg_entry)
+               return false;
+
+       offset = AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance,
+                                            reg_entry->seg_lo, reg_entry->reg_lo);
+       status_lo = RREG32(offset);
+
+       /* only entries carrying the flag require the valid bit to be set */
+       need_valid = !!(reg_entry->flags & AMDGPU_RAS_ERR_STATUS_VALID);
+       if (need_valid &&
+           REG_GET_FIELD(status_lo, ERR_STATUS_LO, ERR_STATUS_VALID_FLAG) == 0)
+               return false;
+
+       *memory_id = REG_GET_FIELD(status_lo, ERR_STATUS_LO, MEMORY_ID);
+       return true;
+}
+
+/**
+ * amdgpu_ras_inst_get_err_cnt_field - read ERR_CNT from err_status_hi
+ * @adev: amdgpu device; referenced implicitly by the register macros
+ * @reg_entry: error status register entry describing the hi register
+ * @instance: instance index used when computing the register offset
+ * @err_cnt: output, written only when the function returns true
+ *
+ * Returns false when @reg_entry is NULL, or when the entry has
+ * AMDGPU_RAS_ERR_INFO_VALID set in its flags and the register's
+ * ERR_INFO_VALID_FLAG field reads zero. Returns true otherwise.
+ */
+bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev,
+                                      const struct amdgpu_ras_err_status_reg_entry *reg_entry,
+                                      uint32_t instance,
+                                      unsigned long *err_cnt)
+{
+       uint32_t offset, status_hi;
+       bool need_valid;
+
+       if (!reg_entry)
+               return false;
+
+       offset = AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance,
+                                            reg_entry->seg_hi, reg_entry->reg_hi);
+       status_hi = RREG32(offset);
+
+       /* only entries carrying the flag require the valid bit to be set */
+       need_valid = !!(reg_entry->flags & AMDGPU_RAS_ERR_INFO_VALID);
+       if (need_valid &&
+           REG_GET_FIELD(status_hi, ERR_STATUS_HI, ERR_INFO_VALID_FLAG) == 0)
+               return false;
+
+       *err_cnt = REG_GET_FIELD(status_hi, ERR_STATUS, ERR_CNT);
+       return true;
+}
+
+/* Return the first entry of @mem_list matching @memory_id, or NULL if none. */
+static const struct amdgpu_ras_memory_id_entry *
+amdgpu_ras_find_memory_id_entry(const struct amdgpu_ras_memory_id_entry *mem_list,
+                               uint32_t mem_list_size,
+                               uint32_t memory_id)
+{
+       uint32_t i;
+
+       if (!mem_list)
+               return NULL;
+
+       for (i = 0; i < mem_list_size; i++) {
+               if (mem_list[i].memory_id == memory_id)
+                       return &mem_list[i];
+       }
+
+       return NULL;
+}
+
+/**
+ * amdgpu_ras_inst_query_ras_error_count - accumulate and log ras errors
+ * @adev: amdgpu device
+ * @reg_list: error status register entries to poll
+ * @reg_list_size: number of entries in @reg_list
+ * @mem_list: optional memory_id to name table, may be NULL
+ * @mem_list_size: number of entries in @mem_list
+ * @instance: instance index passed to the register read helpers
+ * @err_type: error type, only used to pick the name printed in the log
+ * @err_count: running total; counts are added to it, it is never reset here
+ *
+ * For every entry whose error count and memory id can both be read and
+ * whose count is non-zero, the count is added to @err_count and one
+ * dev_info() line is emitted. The line names the memory block when
+ * @mem_list has a matching entry; otherwise it prints the raw memory_id,
+ * so that every counted error also shows up in the log.
+ */
+void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev,
+                                          const struct amdgpu_ras_err_status_reg_entry *reg_list,
+                                          uint32_t reg_list_size,
+                                          const struct amdgpu_ras_memory_id_entry *mem_list,
+                                          uint32_t mem_list_size,
+                                          uint32_t instance,
+                                          uint32_t err_type,
+                                          unsigned long *err_count)
+{
+       const struct amdgpu_ras_memory_id_entry *mem_entry;
+       uint32_t memory_id;
+       unsigned long err_cnt;
+       char err_type_name[16];
+       uint32_t i;
+
+       /* the name depends only on err_type, so resolve it once up front */
+       amdgpu_ras_get_error_type_name(err_type, err_type_name);
+
+       for (i = 0; i < reg_list_size; i++) {
+               /* query err_cnt from err_status_hi */
+               if (!amdgpu_ras_inst_get_err_cnt_field(adev, &reg_list[i],
+                                                      instance, &err_cnt) ||
+                   !err_cnt)
+                       continue;
+
+               /* query memory_id from err_status_lo */
+               if (!amdgpu_ras_inst_get_memory_id_field(adev, &reg_list[i],
+                                                        instance, &memory_id))
+                       continue;
+
+               *err_count += err_cnt;
+
+               /* log the errors */
+               mem_entry = amdgpu_ras_find_memory_id_entry(mem_list,
+                                                           mem_list_size,
+                                                           memory_id);
+               if (mem_entry) {
+                       dev_info(adev->dev,
+                                "%lu %s hardware errors detected in %s, instance: %u, memory block: %s\n",
+                                err_cnt, err_type_name,
+                                reg_list[i].block_name,
+                                instance, mem_entry->name);
+               } else {
+                       /* no name table, or the id is not in it: report the raw id */
+                       dev_info(adev->dev,
+                                "%lu %s hardware errors detected in %s, instance: %u, memory_id: %u\n",
+                                err_cnt, err_type_name,
+                                reg_list[i].block_name,
+                                instance, memory_id);
+               }
+       }
+}
index 17b3d19..c820af7 100644 (file)
@@ -314,6 +314,43 @@ enum amdgpu_ras_ret {
        AMDGPU_RAS_PT,
 };
 
+/*
+ * ras error status register fields
+ *
+ * The names follow the <REG>__<FIELD>__SHIFT / <REG>__<FIELD>_MASK pattern
+ * that REG_GET_FIELD(value, REG, FIELD) is invoked with in amdgpu_ras.c.
+ *
+ * ERR_STATUS_LO: bit 0      ERR_STATUS_VALID_FLAG
+ *                bits 31:24 MEMORY_ID
+ * ERR_STATUS_HI: bit 2      ERR_INFO_VALID_FLAG
+ * ERR_STATUS:    bits 25:23 ERR_CNT (applied to the err_status_hi value)
+ */
+#define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG__SHIFT    0x0
+#define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG_MASK      0x00000001L
+#define ERR_STATUS_LO__MEMORY_ID__SHIFT                        0x18
+#define ERR_STATUS_LO__MEMORY_ID_MASK                  0xFF000000L
+#define ERR_STATUS_HI__ERR_INFO_VALID_FLAG__SHIFT      0x2
+#define ERR_STATUS_HI__ERR_INFO_VALID_FLAG_MASK                0x00000004L
+#define ERR_STATUS__ERR_CNT__SHIFT                     0x17
+#define ERR_STATUS__ERR_CNT_MASK                       0x03800000L
+
+/*
+ * Expands to six comma separated values: <ip>_HWIP, inst, the base index
+ * and name of reg_lo, then the base index and name of reg_hi. Presumably
+ * meant as the leading positional initializers of
+ * struct amdgpu_ras_err_status_reg_entry - confirm against the tables
+ * that use it.
+ */
+#define AMDGPU_RAS_REG_ENTRY(ip, inst, reg_lo, reg_hi) \
+       ip##_HWIP, inst, reg_lo##_BASE_IDX, reg_lo, reg_hi##_BASE_IDX, reg_hi
+
+/*
+ * Register offset for (hwip, ip_inst, segment) plus reg.
+ * NOTE: relies on a variable named "adev" being in scope at the call site.
+ */
+#define AMDGPU_RAS_REG_ENTRY_OFFSET(hwip, ip_inst, segment, reg) \
+       (adev->reg_offset[hwip][ip_inst][segment] + (reg))
+
+/*
+ * Bits for amdgpu_ras_err_status_reg_entry.flags. The query helpers test
+ * INFO_VALID before reading ERR_CNT and STATUS_VALID before reading
+ * MEMORY_ID; ADDRESS_VALID is not referenced by the helpers added here.
+ */
+#define AMDGPU_RAS_ERR_INFO_VALID      (1 << 0)
+#define AMDGPU_RAS_ERR_STATUS_VALID    (1 << 1)
+#define AMDGPU_RAS_ERR_ADDRESS_VALID   (1 << 2)
+
+/*
+ * Describes one pair of ras error status registers (lo/hi) of an IP block.
+ * The first six members are in the same order as the values produced by
+ * AMDGPU_RAS_REG_ENTRY(), so the member order must not be changed.
+ */
+struct amdgpu_ras_err_status_reg_entry {
+       /* hw ip index, first index into adev->reg_offset */
+       uint32_t hwip;
+       /* ip instance; not read by the common query helpers, which take
+        * the instance as a function argument instead
+        */
+       uint32_t ip_inst;
+       /* segment (base index) and register offset of err_status_lo */
+       uint32_t seg_lo;
+       uint32_t reg_lo;
+       /* segment (base index) and register offset of err_status_hi */
+       uint32_t seg_hi;
+       uint32_t reg_hi;
+       /* not read by the common query helpers - TODO confirm intended use */
+       uint32_t reg_inst;
+       /* AMDGPU_RAS_ERR_*_VALID bits selecting which valid flags to check */
+       uint32_t flags;
+       /* block name printed in the error log */
+       const char *block_name;
+};
+
+/* Maps a MEMORY_ID value read from err_status_lo to a printable name. */
+struct amdgpu_ras_memory_id_entry {
+       /* value compared against the MEMORY_ID register field */
+       uint32_t memory_id;
+       /* memory block name printed in the error log */
+       const char *name;
+};
+
 struct ras_common_if {
        enum amdgpu_ras_block block;
        enum amdgpu_ras_error_type type;
@@ -696,4 +733,21 @@ int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_co
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
                                struct amdgpu_ras_block_object *ras_block_obj);
 void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev);
+/*
+ * Write "correctable", "uncorrectable" or "unknown" into err_type_name
+ * according to err_type. No length is passed, so the buffer must be able
+ * to hold the longest name plus the terminating NUL.
+ */
+void amdgpu_ras_get_error_type_name(uint32_t err_type, char *err_type_name);
+/*
+ * Read MEMORY_ID from the err_status_lo register of reg_entry for the
+ * given instance. Returns false (leaving *memory_id untouched) when
+ * reg_entry is NULL or a required valid flag is not set.
+ */
+bool amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device *adev,
+                                        const struct amdgpu_ras_err_status_reg_entry *reg_entry,
+                                        uint32_t instance,
+                                        uint32_t *memory_id);
+/*
+ * Read ERR_CNT from the err_status_hi register of reg_entry for the
+ * given instance. Returns false (leaving *err_cnt untouched) when
+ * reg_entry is NULL or a required valid flag is not set.
+ */
+bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev,
+                                      const struct amdgpu_ras_err_status_reg_entry *reg_entry,
+                                      uint32_t instance,
+                                      unsigned long *err_cnt);
+/*
+ * Walk reg_list for one instance, add every non-zero error count to
+ * *err_count (which is accumulated, not reset) and log the errors with
+ * dev_info(). mem_list may be NULL when no memory id table is available.
+ */
+void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev,
+                                          const struct amdgpu_ras_err_status_reg_entry *reg_list,
+                                          uint32_t reg_list_size,
+                                          const struct amdgpu_ras_memory_id_entry *mem_list,
+                                          uint32_t mem_list_size,
+                                          uint32_t instance,
+                                          uint32_t err_type,
+                                          unsigned long *err_count);
 #endif