drm/amdgpu: save error count in RAS poison handler
author: Tao Zhou <tao.zhou1@amd.com>
Mon, 20 Dec 2021 08:36:54 +0000 (16:36 +0800)
committer: Alex Deucher <alexander.deucher@amd.com>
Thu, 30 Dec 2021 13:54:45 +0000 (08:54 -0500)
Save the error count in the RAS poison handler; otherwise the RAS error count cannot be queried from sysfs.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Stanley.Yang <Stanley.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h

index 0bf09a9..776a947 100644 (file)
@@ -727,7 +727,7 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bo
 
        /* CPU MCA will handle page retirement if connected_to_cpu is 1 */
        if (!adev->gmc.xgmi.connected_to_cpu)
-               amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
+               amdgpu_umc_poison_handler(adev, &err_data, reset);
        else if (reset)
                amdgpu_amdkfd_gpu_reset(adev);
 }
index 0c33f36..46264a4 100644 (file)
 
 #include "amdgpu_ras.h"
 
-static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
-               void *ras_error_status,
-               struct amdgpu_iv_entry *entry)
-{
-       return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
-}
-
-int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
-{
-       int r;
-       struct ras_fs_if fs_info = {
-               .sysfs_name = "umc_err_count",
-       };
-       struct ras_ih_if ih_info = {
-               .cb = amdgpu_umc_process_ras_data_cb,
-       };
-
-       if (!adev->umc.ras_if) {
-               adev->umc.ras_if =
-                       kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
-               if (!adev->umc.ras_if)
-                       return -ENOMEM;
-               adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
-               adev->umc.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
-               adev->umc.ras_if->sub_block_index = 0;
-       }
-       ih_info.head = fs_info.head = *adev->umc.ras_if;
-
-       r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
-                                &fs_info, &ih_info);
-       if (r)
-               goto free;
-
-       if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
-               r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
-               if (r)
-                       goto late_fini;
-       } else {
-               r = 0;
-               goto free;
-       }
-
-       /* ras init of specific umc version */
-       if (adev->umc.ras_funcs &&
-           adev->umc.ras_funcs->err_cnt_init)
-               adev->umc.ras_funcs->err_cnt_init(adev);
-
-       return 0;
-
-late_fini:
-       amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
-free:
-       kfree(adev->umc.ras_if);
-       adev->umc.ras_if = NULL;
-       return r;
-}
-
-void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
-{
-       if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
-                       adev->umc.ras_if) {
-               struct ras_common_if *ras_if = adev->umc.ras_if;
-               struct ras_ih_if ih_info = {
-                       .head = *ras_if,
-                       .cb = amdgpu_umc_process_ras_data_cb,
-               };
-
-               amdgpu_ras_late_fini(adev, ras_if, &ih_info);
-               kfree(ras_if);
-       }
-}
-
-int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
+static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
                void *ras_error_status,
                struct amdgpu_iv_entry *entry,
                bool reset)
@@ -180,6 +108,100 @@ int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
        return AMDGPU_RAS_SUCCESS;
 }
 
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
+               void *ras_error_status,
+               bool reset)
+{
+       int ret;
+       struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+       struct ras_common_if head = {
+               .block = AMDGPU_RAS_BLOCK__UMC,
+       };
+       struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
+
+       ret =
+               amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
+
+       if (ret == AMDGPU_RAS_SUCCESS && obj) {
+               obj->err_data.ue_count += err_data->ue_count;
+               obj->err_data.ce_count += err_data->ce_count;
+       }
+
+       return ret;
+}
+
+static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
+               void *ras_error_status,
+               struct amdgpu_iv_entry *entry)
+{
+       return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
+}
+
+int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
+{
+       int r;
+       struct ras_fs_if fs_info = {
+               .sysfs_name = "umc_err_count",
+       };
+       struct ras_ih_if ih_info = {
+               .cb = amdgpu_umc_process_ras_data_cb,
+       };
+
+       if (!adev->umc.ras_if) {
+               adev->umc.ras_if =
+                       kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
+               if (!adev->umc.ras_if)
+                       return -ENOMEM;
+               adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
+               adev->umc.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+               adev->umc.ras_if->sub_block_index = 0;
+       }
+       ih_info.head = fs_info.head = *adev->umc.ras_if;
+
+       r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
+                                &fs_info, &ih_info);
+       if (r)
+               goto free;
+
+       if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
+               r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
+               if (r)
+                       goto late_fini;
+       } else {
+               r = 0;
+               goto free;
+       }
+
+       /* ras init of specific umc version */
+       if (adev->umc.ras_funcs &&
+           adev->umc.ras_funcs->err_cnt_init)
+               adev->umc.ras_funcs->err_cnt_init(adev);
+
+       return 0;
+
+late_fini:
+       amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
+free:
+       kfree(adev->umc.ras_if);
+       adev->umc.ras_if = NULL;
+       return r;
+}
+
+void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
+{
+       if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
+                       adev->umc.ras_if) {
+               struct ras_common_if *ras_if = adev->umc.ras_if;
+               struct ras_ih_if ih_info = {
+                       .head = *ras_if,
+                       .cb = amdgpu_umc_process_ras_data_cb,
+               };
+
+               amdgpu_ras_late_fini(adev, ras_if, &ih_info);
+               kfree(ras_if);
+       }
+}
+
 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
                struct amdgpu_irq_src *source,
                struct amdgpu_iv_entry *entry)
index 8d18d51..b72194e 100644 (file)
@@ -78,9 +78,8 @@ struct amdgpu_umc {
 
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev);
 void amdgpu_umc_ras_fini(struct amdgpu_device *adev);
-int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
                void *ras_error_status,
-               struct amdgpu_iv_entry *entry,
                bool reset);
 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
                struct amdgpu_irq_src *source,