drm/amdgpu: Adjust the sequence to query ras error info
author Hawking Zhang <Hawking.Zhang@amd.com>
Mon, 20 Mar 2023 09:51:30 +0000 (17:51 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Fri, 9 Jun 2023 13:58:32 +0000 (09:58 -0400)
It turns out STATUS_VALID_FLAG needs to be checked
ahead of any other fields. ADDRESS_VALID_FLAG and
ERR_INFO_VALID_FLAG only manage the ADDRESS and ERR_INFO
fields, respectively. The driver should continue to poll
the ERR_CNT field even if ERR_INFO_VALID_FLAG is not set.

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

index 5ae8960..64f80e8 100644 (file)
@@ -3164,7 +3164,8 @@ bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev,
 
        if ((reg_entry->flags & AMDGPU_RAS_ERR_INFO_VALID) &&
            !REG_GET_FIELD(err_status_hi_data, ERR_STATUS_HI, ERR_INFO_VALID_FLAG))
-               return false;
+               /* keep the check here in case we need to refer to the result later */
+               dev_dbg(adev->dev, "Invalid err_info field\n");
 
        /* read err count */
        *err_cnt = REG_GET_FIELD(err_status_hi_data, ERR_STATUS, ERR_CNT);
@@ -3187,17 +3188,17 @@ void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev,
        uint32_t i, j;
 
        for (i = 0; i < reg_list_size; i++) {
+               /* query memory_id from err_status_lo */
+               if (!amdgpu_ras_inst_get_memory_id_field(adev, &reg_list[i],
+                                                        instance, &memory_id))
+                       continue;
+
                /* query err_cnt from err_status_hi */
                if (!amdgpu_ras_inst_get_err_cnt_field(adev, &reg_list[i],
                                                       instance, &err_cnt) ||
                    !err_cnt)
                        continue;
 
-               /* query memory_id from err_status_lo */
-               if (!amdgpu_ras_inst_get_memory_id_field(adev, &reg_list[i],
-                                                        instance, &memory_id))
-                       continue;
-
                *err_count += err_cnt;
 
                /* log the errors */