drm/amdgpu: decouple EccErrCnt query and clear operation
author Guchun Chen <guchun.chen@amd.com>
Sun, 26 Apr 2020 09:04:31 +0000 (17:04 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Mon, 27 Apr 2020 19:52:10 +0000 (15:52 -0400)
Due to a hardware bug, when RSMU UMC index mode is disabled,
clearing EccErrCnt at the first UMC instance also clears the
EccErrCnt registers of all other instances at the same time. This
corrupts the correctable error count logged in the EccErrCnt registers
as soon as it is queried. So decouple the query from the clear
operation to make the error count query work.

Signed-off-by: Guchun Chen <guchun.chen@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/umc_v6_1.c

index b735ee44f948d2d18a59bf662f725098e8956d71..418cf097c918a58a066d986bdca6d5b82519d3a1 100644 (file)
@@ -104,6 +104,81 @@ static inline uint32_t get_umc_6_reg_offset(struct amdgpu_device *adev,
        return adev->umc.channel_offs*ch_inst + UMC_6_INST_DIST*umc_inst;
 }
 
+static void umc_v6_1_clear_error_count_per_channel(struct amdgpu_device *adev,
+                                       uint32_t umc_reg_offset)
+{      /* reset the EccErrCnt counter for both chip selects (0 and 1) at umc_reg_offset */
+       uint32_t ecc_err_cnt_addr;
+       uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
+
+       if (adev->asic_type == CHIP_ARCTURUS) {
+               /* Arcturus: use the UMC 6_1_2 (_ARCT) register offsets */
+               ecc_err_cnt_sel_addr =
+                       SOC15_REG_OFFSET(UMC, 0,
+                                       mmUMCCH0_0_EccErrCntSel_ARCT);
+               ecc_err_cnt_addr =
+                       SOC15_REG_OFFSET(UMC, 0,
+                                       mmUMCCH0_0_EccErrCnt_ARCT);
+       } else {
+               /* every other ASIC: use the UMC 6_1_1 register offsets */
+               ecc_err_cnt_sel_addr =
+                       SOC15_REG_OFFSET(UMC, 0,
+                                       mmUMCCH0_0_EccErrCntSel);
+               ecc_err_cnt_addr =
+                       SOC15_REG_OFFSET(UMC, 0,
+                                       mmUMCCH0_0_EccErrCnt);
+       }
+
+       /* select the lower chip: read-modify-write EccErrCntCsSel to 0 */
+       ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
+                                       umc_reg_offset) * 4);
+       ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
+                                       UMCCH0_0_EccErrCntSel,
+                                       EccErrCntCsSel, 0);
+       WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
+                       ecc_err_cnt_sel);
+
+       /* clear lower chip error count by writing UMC_V6_1_CE_CNT_INIT */
+       WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
+                       UMC_V6_1_CE_CNT_INIT);
+
+       /* select the higher chip: read-modify-write EccErrCntCsSel to 1 */
+       ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
+                                       umc_reg_offset) * 4);
+       ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
+                                       UMCCH0_0_EccErrCntSel,
+                                       EccErrCntCsSel, 1);
+       WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
+                       ecc_err_cnt_sel);
+
+       /* clear higher chip error count by writing UMC_V6_1_CE_CNT_INIT */
+       WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
+                       UMC_V6_1_CE_CNT_INIT);
+}
+
+static void umc_v6_1_clear_error_count(struct amdgpu_device *adev)
+{      /* clear EccErrCnt for each (umc_inst, ch_inst) visited by LOOP_UMC_INST_AND_CH */
+       uint32_t umc_inst        = 0;
+       uint32_t ch_inst         = 0;
+       uint32_t umc_reg_offset  = 0;
+       uint32_t rsmu_umc_index_state =
+                               umc_v6_1_get_umc_index_mode_state(adev);
+
+       if (rsmu_umc_index_state)       /* nonzero state: disable index mode before the loop */
+               umc_v6_1_disable_umc_index_mode(adev);
+
+       LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
+               umc_reg_offset = get_umc_6_reg_offset(adev,
+                                               umc_inst,
+                                               ch_inst);
+
+               umc_v6_1_clear_error_count_per_channel(adev,
+                                               umc_reg_offset);
+       }
+
+       if (rsmu_umc_index_state)       /* re-enable index mode only if it was disabled above */
+               umc_v6_1_enable_umc_index_mode(adev);
+}
+
 static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev,
                                                   uint32_t umc_reg_offset,
                                                   unsigned long *error_count)
@@ -136,23 +211,21 @@ static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev,
        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
                                        EccErrCntCsSel, 0);
        WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
+
        ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
        *error_count +=
                (REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
                 UMC_V6_1_CE_CNT_INIT);
-       /* clear the lower chip err count */
-       WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V6_1_CE_CNT_INIT);
 
        /* select the higher chip and check the err counter */
        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
                                        EccErrCntCsSel, 1);
        WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
+
        ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
        *error_count +=
                (REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
                 UMC_V6_1_CE_CNT_INIT);
-       /* clear the higher chip err count */
-       WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V6_1_CE_CNT_INIT);
 
        /* check for SRAM correctable error
          MCUMC_STATUS is a 64 bit register */
@@ -228,6 +301,8 @@ static void umc_v6_1_query_ras_error_count(struct amdgpu_device *adev,
 
        if (rsmu_umc_index_state)
                umc_v6_1_enable_umc_index_mode(adev);
+
+       umc_v6_1_clear_error_count(adev);
 }
 
 static void umc_v6_1_query_error_address(struct amdgpu_device *adev,