drm/amdgpu: save umc error records
author Tao Zhou <tao.zhou1@amd.com>
Thu, 15 Aug 2019 08:15:08 +0000 (16:15 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Fri, 13 Sep 2019 22:50:40 +0000 (17:50 -0500)
Save UMC error records to the RAS bad page array.

v2: add bad pages before gpu reset
v3: add NULL check for adev->umc.funcs

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Guchun Chen <guchun.chen@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
drivers/gpu/drm/amd/amdgpu/umc_v6_1.c

index bc1d459..96210e1 100644 (file)
@@ -347,7 +347,7 @@ struct ras_err_data {
        unsigned long ue_count;
        unsigned long ce_count;
        unsigned long err_addr_cnt;
-       uint64_t *err_addr;
+       struct eeprom_table_record *err_addr;
 };
 
 struct ras_err_handler_data {
index 2b4eb0b..9b06d77 100644 (file)
@@ -247,21 +247,43 @@ static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
                struct ras_err_data *err_data,
                struct amdgpu_iv_entry *entry)
 {
-       if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
-               kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
-               if (adev->umc.funcs->query_ras_error_count)
-                       adev->umc.funcs->query_ras_error_count(adev, err_data);
+       if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
+               return AMDGPU_RAS_SUCCESS;
+
+       kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+       if (adev->umc.funcs &&
+           adev->umc.funcs->query_ras_error_count)
+           adev->umc.funcs->query_ras_error_count(adev, err_data);
+
+       if (adev->umc.funcs &&
+           adev->umc.funcs->query_ras_error_address &&
+           adev->umc.max_ras_err_cnt_per_query) {
+               err_data->err_addr =
+                       kcalloc(adev->umc.max_ras_err_cnt_per_query,
+                               sizeof(struct eeprom_table_record), GFP_KERNEL);
+               /* still call query_ras_error_address to clear error status
+                * even NOMEM error is encountered
+                */
+               if(!err_data->err_addr)
+                       DRM_WARN("Failed to alloc memory for umc error address record!\n");
+
                /* umc query_ras_error_address is also responsible for clearing
                 * error status
                 */
-               if (adev->umc.funcs->query_ras_error_address)
-                       adev->umc.funcs->query_ras_error_address(adev, err_data);
+               adev->umc.funcs->query_ras_error_address(adev, err_data);
+       }
+
+       /* only uncorrectable error needs gpu reset */
+       if (err_data->ue_count) {
+               if (err_data->err_addr_cnt &&
+                   amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
+                                               err_data->err_addr_cnt))
+                       DRM_WARN("Failed to add ras bad page!\n");
 
-               /* only uncorrectable error needs gpu reset */
-               if (err_data->ue_count)
-                       amdgpu_ras_reset_gpu(adev, 0);
+               amdgpu_ras_reset_gpu(adev, 0);
        }
 
+       kfree(err_data->err_addr);
        return AMDGPU_RAS_SUCCESS;
 }
 
index 8502e73..09e316a 100644 (file)
@@ -75,6 +75,17 @@ static void umc_v6_1_disable_umc_index_mode(struct amdgpu_device *adev)
                        RSMU_UMC_INDEX_MODE_EN, 0);
 }
 
+static uint32_t umc_v6_1_get_umc_inst(struct amdgpu_device *adev)
+{ /* Return the INSTANCE field of the RSMU UMC index register. */
+       uint32_t rsmu_umc_index; /* raw 32-bit register value as read back */
+
+       rsmu_umc_index = RREG32_SOC15(RSMU, 0,
+                               mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU); /* NOTE(review): presumably reflects the UMC instance selected via index mode - confirm */
+       return REG_GET_FIELD(rsmu_umc_index, /* keep only the instance bits; the caller stores this as mcumc_id */
+                               RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
+                               RSMU_UMC_INDEX_INSTANCE);
+}
+
 static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev,
                                                   uint32_t umc_reg_offset,
                                                   unsigned long *error_count)
@@ -165,7 +176,8 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
                                         uint32_t umc_reg_offset, uint32_t channel_index)
 {
        uint32_t lsb, mc_umc_status_addr;
-       uint64_t mc_umc_status, err_addr;
+       uint64_t mc_umc_status, err_addr, retired_page;
+       struct eeprom_table_record *err_rec;
 
        mc_umc_status_addr =
                SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
@@ -177,6 +189,7 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
                return;
        }
 
+       err_rec = &err_data->err_addr[err_data->err_addr_cnt];
        mc_umc_status = RREG64_UMC(mc_umc_status_addr + umc_reg_offset);
 
        /* calculate error address if ue/ce error is detected */
@@ -191,12 +204,24 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
                err_addr &= ~((0x1ULL << lsb) - 1);
 
                /* translate umc channel address to soc pa, 3 parts are included */
-               err_data->err_addr[err_data->err_addr_cnt] =
-                                               ADDR_OF_8KB_BLOCK(err_addr) |
-                                               ADDR_OF_256B_BLOCK(channel_index) |
-                                               OFFSET_IN_256B_BLOCK(err_addr);
-
-               err_data->err_addr_cnt++;
+               retired_page = ADDR_OF_8KB_BLOCK(err_addr) |
+                               ADDR_OF_256B_BLOCK(channel_index) |
+                               OFFSET_IN_256B_BLOCK(err_addr);
+
+               /* we only save ue error information currently, ce is skipped */
+               if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
+                               == 1) {
+                       err_rec->address = err_addr;
+                       /* page frame address is saved */
+                       err_rec->retired_page = retired_page >> PAGE_SHIFT;
+                       err_rec->ts = (uint64_t)ktime_get_real_seconds();
+                       err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
+                       err_rec->cu = 0;
+                       err_rec->mem_channel = channel_index;
+                       err_rec->mcumc_id = umc_v6_1_get_umc_inst(adev);
+
+                       err_data->err_addr_cnt++;
+               }
        }
 
        /* clear umc status */