drm/amdgpu: added xgmi ras error reset sequence
author John Clements <john.clements@amd.com>
Wed, 25 Mar 2020 07:56:31 +0000 (15:56 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Wed, 1 Apr 2020 18:44:42 +0000 (14:44 -0400)
added mechanism to clear xgmi ras status in between error queries

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h

index 95b3327..8c32155 100644 (file)
@@ -604,6 +604,8 @@ int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev)
            adev->gmc.xgmi.num_physical_nodes == 0)
                return 0;
 
+       amdgpu_xgmi_reset_ras_error_count(adev);
+
        if (!adev->gmc.xgmi.ras_if) {
                adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
                if (!adev->gmc.xgmi.ras_if)
@@ -668,6 +670,32 @@ uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
        return addr + dram_base_addr;
 }
 
+static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg)
+{
+       WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF);
+       WREG32_PCIE(pcs_status_reg, 0);
+}
+
+void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
+{
+       uint32_t i;
+
+       switch (adev->asic_type) {
+       case CHIP_ARCTURUS:
+               for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++)
+                       pcs_clear_status(adev,
+                                        xgmi_pcs_err_status_reg_arct[i]);
+               break;
+       case CHIP_VEGA20:
+               for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++)
+                       pcs_clear_status(adev,
+                                        xgmi_pcs_err_status_reg_vg20[i]);
+               break;
+       default:
+               break;
+       }
+}
+
 static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
                                              uint32_t value,
                                              uint32_t *ue_count,
@@ -758,6 +786,8 @@ int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
                break;
        }
 
+       amdgpu_xgmi_reset_ras_error_count(adev);
+
        err_data->ue_count += ue_cnt;
        err_data->ce_count += ce_cnt;
 
index 4a92067..d5a6390 100644 (file)
@@ -56,6 +56,7 @@ uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
                                           uint64_t addr);
 int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
                                      void *ras_error_status);
+void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev);
 
 static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
                struct amdgpu_device *bo_adev)