From 66399248feaf4a2fa4cd76765412a4139aca28e9 Mon Sep 17 00:00:00 2001 From: John Clements Date: Wed, 25 Mar 2020 15:56:31 +0800 Subject: [PATCH] drm/amdgpu: added xgmi ras error reset sequence added mechanism to clear xgmi ras status in between error queries Reviewed-by: Hawking Zhang Signed-off-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 30 ++++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 1 + 2 files changed, 31 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 95b3327..8c32155 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -604,6 +604,8 @@ int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev) adev->gmc.xgmi.num_physical_nodes == 0) return 0; + amdgpu_xgmi_reset_ras_error_count(adev); + if (!adev->gmc.xgmi.ras_if) { adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL); if (!adev->gmc.xgmi.ras_if) @@ -668,6 +670,32 @@ uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev, return addr + dram_base_addr; } +static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg) +{ + WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF); + WREG32_PCIE(pcs_status_reg, 0); +} + +void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev) +{ + uint32_t i; + + switch (adev->asic_type) { + case CHIP_ARCTURUS: + for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) + pcs_clear_status(adev, + xgmi_pcs_err_status_reg_arct[i]); + break; + case CHIP_VEGA20: + for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) + pcs_clear_status(adev, + xgmi_pcs_err_status_reg_vg20[i]); + break; + default: + break; + } +} + static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev, uint32_t value, uint32_t *ue_count, @@ -758,6 +786,8 @@ int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, break; } + 
amdgpu_xgmi_reset_ras_error_count(adev); + err_data->ue_count += ue_cnt; err_data->ce_count += ce_cnt; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h index 4a92067..d5a6390 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h @@ -56,6 +56,7 @@ uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev, uint64_t addr); int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status); +void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev); static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev, struct amdgpu_device *bo_adev) -- 2.7.4