From ceb47e0d84d9d327ef3cd7091fb54f612394f189 Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Wed, 24 Mar 2021 11:51:35 -0400 Subject: [PATCH] drm/amdgpu: Fix SDMA RAS error reporting on Aldebaran Fix the following issues with SDMA RAS error reporting: 1. Read the EDC_COUNTER2 register also to fetch error counts for all sub-blocks in SDMA. 2. SDMA RAS on Aldebaran suports single-bit uncorrectable errors only. So, report error count in UE count instead of CE count. Signed-off-by: Mukul Joshi Reviewed-By: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c | 35 ++++++++++++++++++++------ 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c index 6fcb95c89999..bf95007f0843 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c @@ -160,6 +160,7 @@ static const struct soc15_ras_field_entry sdma_v4_4_ras_fields[] = { }; static void sdma_v4_4_get_ras_error_count(struct amdgpu_device *adev, + uint32_t reg_offset, uint32_t value, uint32_t instance, uint32_t *sec_count) @@ -169,6 +170,9 @@ static void sdma_v4_4_get_ras_error_count(struct amdgpu_device *adev, /* double bits error (multiple bits) error detection is not supported */ for (i = 0; i < ARRAY_SIZE(sdma_v4_4_ras_fields); i++) { + if (sdma_v4_4_ras_fields[i].reg_offset != reg_offset) + continue; + /* the SDMA_EDC_COUNTER register in each sdma instance * shares the same sed shift_mask * */ @@ -197,13 +201,30 @@ static int sdma_v4_4_query_ras_error_count(struct amdgpu_device *adev, reg_value = RREG32(reg_offset); /* double bit error is not supported */ if (reg_value) - sdma_v4_4_get_ras_error_count(adev, reg_value, instance, &sec_count); - /* err_data->ce_count should be initialized to 0 - * before calling into this function */ - err_data->ce_count += sec_count; - /* double bit error is not supported - * set ue count to 0 */ - err_data->ue_count = 0; + sdma_v4_4_get_ras_error_count(adev, regSDMA0_EDC_COUNTER, reg_value, + instance, &sec_count); + + reg_offset = sdma_v4_4_get_reg_offset(adev, instance, regSDMA0_EDC_COUNTER2); + reg_value = RREG32(reg_offset); + /* double bit error is not supported */ + if (reg_value) + sdma_v4_4_get_ras_error_count(adev, regSDMA0_EDC_COUNTER2, reg_value, + instance, &sec_count); + + /* + * err_data->ue_count should be initialized to 0 + * before calling into this function + * + * SDMA RAS supports single bit uncorrectable error detection. + * So, increment uncorrectable error count. + */ + err_data->ue_count += sec_count; + + /* + * SDMA RAS does not support correctable errors. + * Set ce count to 0. + */ + err_data->ce_count = 0; return 0; }; -- 2.34.1