drm/amdgpu: Add query_ras_error_count for sdma v4_4_2
author: Hawking Zhang <Hawking.Zhang@amd.com>
Sun, 5 Feb 2023 14:54:50 +0000 (22:54 +0800)
committer: Alex Deucher <alexander.deucher@amd.com>
Fri, 9 Jun 2023 13:53:00 +0000 (09:53 -0400)
Add query_ras_error_count callback for sdma
v4_4_2. It will be used to query and log sdma
uncorrectable error count and memory block.

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c

index 62afb28..513ac22 100644 (file)
@@ -62,6 +62,34 @@ struct amdgpu_sdma_instance {
        uint32_t                aid_id;
 };
 
+/*
+ * Identifiers for the SDMA memory blocks that RAS error reporting can name.
+ *
+ * Values are assigned explicitly and start at 1. Each ID from 1 to 24 is
+ * paired with a printable name in sdma_v4_4_2_ras_memory_list
+ * (sdma_v4_4_2.c). AMDGPU_SDMA_MEMORY_BLOCK_LAST takes the next implicit
+ * value (25) and has no name entry.
+ *
+ * NOTE(review): presumably these values mirror the memory-ID encoding
+ * reported by the SDMA error status registers - confirm against the
+ * register specification before renumbering anything.
+ */
+enum amdgpu_sdma_ras_memory_id {
+       AMDGPU_SDMA_MBANK_DATA_BUF0 = 1,
+       AMDGPU_SDMA_MBANK_DATA_BUF1 = 2,
+       AMDGPU_SDMA_MBANK_DATA_BUF2 = 3,
+       AMDGPU_SDMA_MBANK_DATA_BUF3 = 4,
+       AMDGPU_SDMA_MBANK_DATA_BUF4 = 5,
+       AMDGPU_SDMA_MBANK_DATA_BUF5 = 6,
+       AMDGPU_SDMA_MBANK_DATA_BUF6 = 7,
+       AMDGPU_SDMA_MBANK_DATA_BUF7 = 8,
+       AMDGPU_SDMA_MBANK_DATA_BUF8 = 9,
+       AMDGPU_SDMA_MBANK_DATA_BUF9 = 10,
+       AMDGPU_SDMA_MBANK_DATA_BUF10 = 11,
+       AMDGPU_SDMA_MBANK_DATA_BUF11 = 12,
+       AMDGPU_SDMA_MBANK_DATA_BUF12 = 13,
+       AMDGPU_SDMA_MBANK_DATA_BUF13 = 14,
+       AMDGPU_SDMA_MBANK_DATA_BUF14 = 15,
+       AMDGPU_SDMA_MBANK_DATA_BUF15 = 16,
+       AMDGPU_SDMA_UCODE_BUF = 17,
+       AMDGPU_SDMA_RB_CMD_BUF = 18,
+       AMDGPU_SDMA_IB_CMD_BUF = 19,
+       AMDGPU_SDMA_UTCL1_RD_FIFO = 20,
+       AMDGPU_SDMA_UTCL1_RDBST_FIFO = 21,
+       AMDGPU_SDMA_UTCL1_WR_FIFO = 22,
+       AMDGPU_SDMA_DATA_LUT_FIFO = 23,
+       AMDGPU_SDMA_SPLIT_DAT_BUF = 24,
+       AMDGPU_SDMA_MEMORY_BLOCK_LAST,
+};
+
 /* SDMA RAS state: wraps the amdgpu_ras_block_object used for the SDMA block. */
 struct amdgpu_sdma_ras {
        struct amdgpu_ras_block_object ras_block;
 };
index dca0bee..925ca62 100644 (file)
@@ -2071,3 +2071,67 @@ struct amdgpu_xcp_ip_funcs sdma_v4_4_2_xcp_funcs = {
        .suspend = &sdma_v4_4_2_xcp_suspend,
        .resume = &sdma_v4_4_2_xcp_resume
 };
+
+/*
+ * Uncorrectable-error (UE) status register table handed to
+ * amdgpu_ras_inst_query_ras_error_count(): a single LO/HI status register
+ * pair flagged with both the ERR_INFO and ERR_STATUS valid bits and
+ * labelled "SDMA".
+ */
+static const struct amdgpu_ras_err_status_reg_entry sdma_v4_4_2_ue_reg_list[] = {
+       {AMDGPU_RAS_REG_ENTRY(SDMA0, 0, regSDMA_UE_ERR_STATUS_LO, regSDMA_UE_ERR_STATUS_HI),
+       1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "SDMA"},
+};
+
+/*
+ * Printable name for every enum amdgpu_sdma_ras_memory_id value (except
+ * AMDGPU_SDMA_MEMORY_BLOCK_LAST). Passed alongside the register table so
+ * the RAS helper can report which memory block an error refers to.
+ */
+static const struct amdgpu_ras_memory_id_entry sdma_v4_4_2_ras_memory_list[] = {
+       {AMDGPU_SDMA_MBANK_DATA_BUF0, "SDMA_MBANK_DATA_BUF0"},
+       {AMDGPU_SDMA_MBANK_DATA_BUF1, "SDMA_MBANK_DATA_BUF1"},
+       {AMDGPU_SDMA_MBANK_DATA_BUF2, "SDMA_MBANK_DATA_BUF2"},
+       {AMDGPU_SDMA_MBANK_DATA_BUF3, "SDMA_MBANK_DATA_BUF3"},
+       {AMDGPU_SDMA_MBANK_DATA_BUF4, "SDMA_MBANK_DATA_BUF4"},
+       {AMDGPU_SDMA_MBANK_DATA_BUF5, "SDMA_MBANK_DATA_BUF5"},
+       {AMDGPU_SDMA_MBANK_DATA_BUF6, "SDMA_MBANK_DATA_BUF6"},
+       {AMDGPU_SDMA_MBANK_DATA_BUF7, "SDMA_MBANK_DATA_BUF7"},
+       {AMDGPU_SDMA_MBANK_DATA_BUF8, "SDMA_MBANK_DATA_BUF8"},
+       {AMDGPU_SDMA_MBANK_DATA_BUF9, "SDMA_MBANK_DATA_BUF9"},
+       {AMDGPU_SDMA_MBANK_DATA_BUF10, "SDMA_MBANK_DATA_BUF10"},
+       {AMDGPU_SDMA_MBANK_DATA_BUF11, "SDMA_MBANK_DATA_BUF11"},
+       {AMDGPU_SDMA_MBANK_DATA_BUF12, "SDMA_MBANK_DATA_BUF12"},
+       {AMDGPU_SDMA_MBANK_DATA_BUF13, "SDMA_MBANK_DATA_BUF13"},
+       {AMDGPU_SDMA_MBANK_DATA_BUF14, "SDMA_MBANK_DATA_BUF14"},
+       {AMDGPU_SDMA_MBANK_DATA_BUF15, "SDMA_MBANK_DATA_BUF15"},
+       {AMDGPU_SDMA_UCODE_BUF, "SDMA_UCODE_BUF"},
+       {AMDGPU_SDMA_RB_CMD_BUF, "SDMA_RB_CMD_BUF"},
+       {AMDGPU_SDMA_IB_CMD_BUF, "SDMA_IB_CMD_BUF"},
+       {AMDGPU_SDMA_UTCL1_RD_FIFO, "SDMA_UTCL1_RD_FIFO"},
+       {AMDGPU_SDMA_UTCL1_RDBST_FIFO, "SDMA_UTCL1_RDBST_FIFO"},
+       {AMDGPU_SDMA_UTCL1_WR_FIFO, "SDMA_UTCL1_WR_FIFO"},
+       {AMDGPU_SDMA_DATA_LUT_FIFO, "SDMA_DATA_LUT_FIFO"},
+       {AMDGPU_SDMA_SPLIT_DAT_BUF, "SDMA_SPLIT_DAT_BUF"},
+};
+
+/*
+ * Query the uncorrectable error count of a single SDMA instance.
+ *
+ * @adev: amdgpu device
+ * @sdma_inst: SDMA instance index, forwarded unchanged to the RAS helper
+ * @ras_err_status: pointer to a struct ras_err_data; its ue_count field is
+ *                  the counter handed to the RAS helper
+ *
+ * Only the UE register table is queried; correctable errors are not
+ * reported for this IP version.
+ */
+static void sdma_v4_4_2_inst_query_ras_error_count(struct amdgpu_device *adev,
+                                                  uint32_t sdma_inst,
+                                                  void *ras_err_status)
+{
+       struct ras_err_data *err_data = ras_err_status;
+
+       /* sdma v4_4_2 doesn't support query ce counts */
+       amdgpu_ras_inst_query_ras_error_count(adev,
+                                       sdma_v4_4_2_ue_reg_list,
+                                       ARRAY_SIZE(sdma_v4_4_2_ue_reg_list),
+                                       sdma_v4_4_2_ras_memory_list,
+                                       ARRAY_SIZE(sdma_v4_4_2_ras_memory_list),
+                                       sdma_inst,
+                                       AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
+                                       &err_data->ue_count);
+}
+
+/*
+ * Query the RAS error count of every SDMA instance on the device.
+ *
+ * @adev: amdgpu device
+ * @ras_err_status: opaque error-data pointer, forwarded unchanged to the
+ *                  per-instance query
+ *
+ * Emits a warning and does nothing when RAS is not supported for the SDMA
+ * block. Does nothing when the device reports no SDMA instances.
+ */
+static void sdma_v4_4_2_query_ras_error_count(struct amdgpu_device *adev,
+                                             void *ras_err_status)
+{
+       uint32_t inst_mask;
+       int i = 0;
+
+       if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
+               dev_warn(adev->dev, "SDMA RAS is not supported\n");
+               return;
+       }
+
+       /*
+        * GENMASK() needs high >= low; with zero instances the high bit
+        * would be -1, so bail out before building the mask.
+        */
+       if (!adev->sdma.num_instances)
+               return;
+
+       /* One bit per SDMA instance: bits [num_instances - 1 : 0] */
+       inst_mask = GENMASK(adev->sdma.num_instances - 1, 0);
+       for_each_inst(i, inst_mask)
+               sdma_v4_4_2_inst_query_ras_error_count(adev, i, ras_err_status);
+}