drm/amdgpu: only harvest gcea/mmea error status in aldebaran
author Hawking Zhang <Hawking.Zhang@amd.com>
Fri, 16 Apr 2021 09:30:12 +0000 (17:30 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Wed, 21 Apr 2021 01:35:55 +0000 (21:35 -0400)
In aldebaran, the driver only needs to harvest the SDP
RdRspStatus, WrRspStatus and the first parity error
on RdRsp data. Check the error type before harvesting
error information.

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Stanley Yang <Stanley.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c

index 0070c091b2b7f789d2f679b22c0c45cb92a2491f..44d37e40ab5082f9d0c3e3e4249958b27a47c836 100644 (file)
@@ -808,7 +808,7 @@ static struct gfx_v9_4_2_utc_block gfx_v9_4_2_utc_blocks[] = {
          REG_SET_FIELD(0, ATC_L2_CACHE_4K_DSM_CNTL, WRITE_COUNTERS, 1) },
 };
 
-static const struct soc15_reg_entry gfx_v9_4_2_rdrsp_status_regs =
+static const struct soc15_reg_entry gfx_v9_4_2_ea_err_status_regs =
        { SOC15_REG_ENTRY(GC, 0, regGCEA_ERR_STATUS), 0, 1, 16 };
 
 static int gfx_v9_4_2_get_reg_error_count(struct amdgpu_device *adev,
@@ -1041,11 +1041,11 @@ static void gfx_v9_4_2_reset_ea_err_status(struct amdgpu_device *adev)
        uint32_t i, j;
 
        mutex_lock(&adev->grbm_idx_mutex);
-       for (i = 0; i < gfx_v9_4_2_rdrsp_status_regs.se_num; i++) {
-               for (j = 0; j < gfx_v9_4_2_rdrsp_status_regs.instance;
+       for (i = 0; i < gfx_v9_4_2_ea_err_status_regs.se_num; i++) {
+               for (j = 0; j < gfx_v9_4_2_ea_err_status_regs.instance;
                     j++) {
                        gfx_v9_4_2_select_se_sh(adev, i, 0, j);
-                       WREG32(SOC15_REG_ENTRY_OFFSET(gfx_v9_4_2_rdrsp_status_regs), 0x10);
+                       WREG32(SOC15_REG_ENTRY_OFFSET(gfx_v9_4_2_ea_err_status_regs), 0x10);
                }
        }
        gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
@@ -1090,17 +1090,20 @@ static void gfx_v9_4_2_query_ea_err_status(struct amdgpu_device *adev)
 
        mutex_lock(&adev->grbm_idx_mutex);
 
-       for (i = 0; i < gfx_v9_4_2_rdrsp_status_regs.se_num; i++) {
-               for (j = 0; j < gfx_v9_4_2_rdrsp_status_regs.instance;
+       for (i = 0; i < gfx_v9_4_2_ea_err_status_regs.se_num; i++) {
+               for (j = 0; j < gfx_v9_4_2_ea_err_status_regs.instance;
                     j++) {
                        gfx_v9_4_2_select_se_sh(adev, i, 0, j);
                        reg_value = RREG32(SOC15_REG_ENTRY_OFFSET(
-                               gfx_v9_4_2_rdrsp_status_regs));
-                       if ((reg_value & 0xFFF) != GCEA_ERR_STATUS__SDP_RDRSP_DATASTATUS_MASK)
+                               gfx_v9_4_2_ea_err_status_regs));
+                       if (REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_RDRSP_STATUS) ||
+                           REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_WRRSP_STATUS) ||
+                           REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_RDRSP_DATAPARITY_ERROR)) {
                                dev_warn(adev->dev, "GCEA err detected at instance: %d, status: 0x%x!\n",
                                                j, reg_value);
+                       }
                        /* clear after read */
-                       WREG32(SOC15_REG_ENTRY_OFFSET(gfx_v9_4_2_rdrsp_status_regs), 0x10);
+                       WREG32(SOC15_REG_ENTRY_OFFSET(gfx_v9_4_2_ea_err_status_regs), 0x10);
                }
        }
 
index f9dc13584f6c940d6b17653fa7d3953192d09332..1c999c56d995be1bdd91c199fa19a5e3a9641c12 100644 (file)
@@ -1286,7 +1286,7 @@ static void mmhub_v1_7_reset_ras_error_count(struct amdgpu_device *adev)
        }
 }
 
-static const struct soc15_reg_entry mmhub_v1_7_err_status_regs[] = {
+static const struct soc15_reg_entry mmhub_v1_7_ea_err_status_regs[] = {
        { SOC15_REG_ENTRY(MMHUB, 0, regMMEA0_ERR_STATUS), 0, 0, 0 },
        { SOC15_REG_ENTRY(MMHUB, 0, regMMEA1_ERR_STATUS), 0, 0, 0 },
        { SOC15_REG_ENTRY(MMHUB, 0, regMMEA2_ERR_STATUS), 0, 0, 0 },
@@ -1303,12 +1303,15 @@ static void mmhub_v1_7_query_ras_error_status(struct amdgpu_device *adev)
        if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MMHUB))
                return;
 
-       for (i = 0; i < ARRAY_SIZE(mmhub_v1_7_err_status_regs); i++) {
+       for (i = 0; i < ARRAY_SIZE(mmhub_v1_7_ea_err_status_regs); i++) {
                reg_value =
-                       RREG32(SOC15_REG_ENTRY_OFFSET(mmhub_v1_7_err_status_regs[i]));
-               if ((reg_value & 0xFFF) != MMEA0_ERR_STATUS__SDP_RDRSP_DATASTATUS_MASK)
+                       RREG32(SOC15_REG_ENTRY_OFFSET(mmhub_v1_7_ea_err_status_regs[i]));
+               if (REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, SDP_RDRSP_STATUS) ||
+                   REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, SDP_WRRSP_STATUS) ||
+                   REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, SDP_RDRSP_DATAPARITY_ERROR)) {
                        dev_warn(adev->dev, "MMHUB EA err detected at instance: %d, status: 0x%x!\n",
                                        i, reg_value);
+               }
        }
 }