drm/amdgpu: Add query_ras_error_count for mmhub v1_8
authorHawking Zhang <Hawking.Zhang@amd.com>
Thu, 2 Feb 2023 13:00:39 +0000 (21:00 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Fri, 9 Jun 2023 13:53:13 +0000 (09:53 -0400)
Add query_ras_error_count callback for mmhub v1_8.
It will be used to query and log mmhub error count
and memory block.

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c

index d21bb6d..1ca9d4e 100644 (file)
 #ifndef __AMDGPU_MMHUB_H__
 #define __AMDGPU_MMHUB_H__
 
+enum amdgpu_mmhub_ras_memory_id {
+       AMDGPU_MMHUB_WGMI_PAGEMEM = 0,
+       AMDGPU_MMHUB_RGMI_PAGEMEM = 1,
+       AMDGPU_MMHUB_WDRAM_PAGEMEM = 2,
+       AMDGPU_MMHUB_RDRAM_PAGEMEM = 3,
+       AMDGPU_MMHUB_WIO_CMDMEM = 4,
+       AMDGPU_MMHUB_RIO_CMDMEM = 5,
+       AMDGPU_MMHUB_WGMI_CMDMEM = 6,
+       AMDGPU_MMHUB_RGMI_CMDMEM = 7,
+       AMDGPU_MMHUB_WDRAM_CMDMEM = 8,
+       AMDGPU_MMHUB_RDRAM_CMDMEM = 9,
+       AMDGPU_MMHUB_MAM_DMEM0 = 10,
+       AMDGPU_MMHUB_MAM_DMEM1 = 11,
+       AMDGPU_MMHUB_MAM_DMEM2 = 12,
+       AMDGPU_MMHUB_MAM_DMEM3 = 13,
+       AMDGPU_MMHUB_WRET_TAGMEM = 19,
+       AMDGPU_MMHUB_RRET_TAGMEM = 20,
+       AMDGPU_MMHUB_WIO_DATAMEM = 21,
+       AMDGPU_MMHUB_WGMI_DATAMEM = 22,
+       AMDGPU_MMHUB_WDRAM_DATAMEM = 23,
+       AMDGPU_MMHUB_MEMORY_BLOCK_LAST,
+};
+
 struct amdgpu_mmhub_ras {
        struct amdgpu_ras_block_object ras_block;
 };
index a8faf66..11240ca 100644 (file)
@@ -29,6 +29,7 @@
 
 #include "soc15_common.h"
 #include "soc15.h"
+#include "amdgpu_ras.h"
 
 #define regVM_L2_CNTL3_DEFAULT 0x80100007
 #define regVM_L2_CNTL4_DEFAULT 0x000000c1
@@ -579,3 +580,95 @@ const struct amdgpu_mmhub_funcs mmhub_v1_8_funcs = {
        .set_clockgating = mmhub_v1_8_set_clockgating,
        .get_clockgating = mmhub_v1_8_get_clockgating,
 };
+
+static const struct amdgpu_ras_err_status_reg_entry mmhub_v1_8_ce_reg_list[] = {
+       {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA0_CE_ERR_STATUS_LO, regMMEA0_CE_ERR_STATUS_HI),
+       1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA0"},
+       {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA1_CE_ERR_STATUS_LO, regMMEA1_CE_ERR_STATUS_HI),
+       1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA1"},
+       {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA2_CE_ERR_STATUS_LO, regMMEA2_CE_ERR_STATUS_HI),
+       1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA2"},
+       {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA3_CE_ERR_STATUS_LO, regMMEA3_CE_ERR_STATUS_HI),
+       1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA3"},
+       {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA4_CE_ERR_STATUS_LO, regMMEA4_CE_ERR_STATUS_HI),
+       1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA4"},
+       {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMM_CANE_CE_ERR_STATUS_LO, regMM_CANE_CE_ERR_STATUS_HI),
+       1, 0, "MM_CANE"},
+};
+
+static const struct amdgpu_ras_err_status_reg_entry mmhub_v1_8_ue_reg_list[] = {
+       {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA0_UE_ERR_STATUS_LO, regMMEA0_UE_ERR_STATUS_HI),
+       1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA0"},
+       {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA1_UE_ERR_STATUS_LO, regMMEA1_UE_ERR_STATUS_HI),
+       1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA1"},
+       {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA2_UE_ERR_STATUS_LO, regMMEA2_UE_ERR_STATUS_HI),
+       1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA2"},
+       {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA3_UE_ERR_STATUS_LO, regMMEA3_UE_ERR_STATUS_HI),
+       1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA3"},
+       {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA4_UE_ERR_STATUS_LO, regMMEA4_UE_ERR_STATUS_HI),
+       1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA4"},
+       {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMM_CANE_UE_ERR_STATUS_LO, regMM_CANE_UE_ERR_STATUS_HI),
+       1, 0, "MM_CANE"},
+};
+
+static const struct amdgpu_ras_memory_id_entry mmhub_v1_8_ras_memory_list[] = {
+       {AMDGPU_MMHUB_WGMI_PAGEMEM, "MMEA_WGMI_PAGEMEM"},
+       {AMDGPU_MMHUB_RGMI_PAGEMEM, "MMEA_RGMI_PAGEMEM"},
+       {AMDGPU_MMHUB_WDRAM_PAGEMEM, "MMEA_WDRAM_PAGEMEM"},
+       {AMDGPU_MMHUB_RDRAM_PAGEMEM, "MMEA_RDRAM_PAGEMEM"},
+       {AMDGPU_MMHUB_WIO_CMDMEM, "MMEA_WIO_CMDMEM"},
+       {AMDGPU_MMHUB_RIO_CMDMEM, "MMEA_RIO_CMDMEM"},
+       {AMDGPU_MMHUB_WGMI_CMDMEM, "MMEA_WGMI_CMDMEM"},
+       {AMDGPU_MMHUB_RGMI_CMDMEM, "MMEA_RGMI_CMDMEM"},
+       {AMDGPU_MMHUB_WDRAM_CMDMEM, "MMEA_WDRAM_CMDMEM"},
+       {AMDGPU_MMHUB_RDRAM_CMDMEM, "MMEA_RDRAM_CMDMEM"},
+       {AMDGPU_MMHUB_MAM_DMEM0, "MMEA_MAM_DMEM0"},
+       {AMDGPU_MMHUB_MAM_DMEM1, "MMEA_MAM_DMEM1"},
+       {AMDGPU_MMHUB_MAM_DMEM2, "MMEA_MAM_DMEM2"},
+       {AMDGPU_MMHUB_MAM_DMEM3, "MMEA_MAM_DMEM3"},
+       {AMDGPU_MMHUB_WRET_TAGMEM, "MMEA_WRET_TAGMEM"},
+       {AMDGPU_MMHUB_RRET_TAGMEM, "MMEA_RRET_TAGMEM"},
+       {AMDGPU_MMHUB_WIO_DATAMEM, "MMEA_WIO_DATAMEM"},
+       {AMDGPU_MMHUB_WGMI_DATAMEM, "MMEA_WGMI_DATAMEM"},
+       {AMDGPU_MMHUB_WDRAM_DATAMEM, "MMEA_WDRAM_DATAMEM"},
+};
+
+static void mmhub_v1_8_inst_query_ras_error_count(struct amdgpu_device *adev,
+                                                 uint32_t mmhub_inst,
+                                                 void *ras_err_status)
+{
+       struct ras_err_data *err_data = (struct ras_err_data *)ras_err_status;
+
+       amdgpu_ras_inst_query_ras_error_count(adev,
+                                       mmhub_v1_8_ce_reg_list,
+                                       ARRAY_SIZE(mmhub_v1_8_ce_reg_list),
+                                       mmhub_v1_8_ras_memory_list,
+                                       ARRAY_SIZE(mmhub_v1_8_ras_memory_list),
+                                       mmhub_inst,
+                                       AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE,
+                                       &err_data->ce_count);
+       amdgpu_ras_inst_query_ras_error_count(adev,
+                                       mmhub_v1_8_ue_reg_list,
+                                       ARRAY_SIZE(mmhub_v1_8_ue_reg_list),
+                                       mmhub_v1_8_ras_memory_list,
+                                       ARRAY_SIZE(mmhub_v1_8_ras_memory_list),
+                                       mmhub_inst,
+                                       AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
+                                       &err_data->ue_count);
+}
+
+static void mmhub_v1_8_query_ras_error_count(struct amdgpu_device *adev,
+                                            void *ras_err_status)
+{
+       uint32_t inst_mask;
+       uint32_t i;
+
+       if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MMHUB)) {
+               dev_warn(adev->dev, "MMHUB RAS is not supported\n");
+               return;
+       }
+
+       inst_mask = adev->aid_mask;
+       for_each_inst(i, inst_mask)
+               mmhub_v1_8_inst_query_ras_error_count(adev, i, ras_err_status);
+}