drm/amdgpu: RAS harvest on driver load
author: John Clements <john.clements@amd.com>
Thu, 25 Mar 2021 09:10:10 +0000 (17:10 +0800)
committer: Alex Deucher <alexander.deucher@amd.com>
Fri, 9 Apr 2021 20:54:51 +0000 (16:54 -0400)
In the event of a RAS UE followed by a warm reset, error counters shall be harvested and cleared on driver load.

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

index 1d905bc..b0fe588 100644 (file)
@@ -2090,6 +2090,32 @@ release_con:
        return r;
 }
 
+/*
+ * Report whether persistent EDC harvesting applies to this device.
+ *
+ * Returns 1 when adev->gmc.xgmi.connected_to_cpu is set, 0 otherwise.
+ */
+static int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
+{
+       return adev->gmc.xgmi.connected_to_cpu ? 1 : 0;
+}
+
+/*
+ * Harvest and clear the RAS error status of one block.
+ *
+ * @adev: device the RAS block belongs to
+ * @ras_block: RAS block whose error status is queried and then reset
+ *
+ * Does nothing when amdgpu_persistent_edc_harvesting_supported() reports the
+ * device as unsupported. Otherwise the block's error status is queried with
+ * amdgpu_ras_query_error_status() and then reset with
+ * amdgpu_ras_reset_error_status(). Both steps are best effort: a failure is
+ * only logged as a warning.
+ *
+ * Returns 0 in all cases.
+ */
+static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
+                                       struct ras_common_if *ras_block)
+{
+       struct ras_query_if info = {
+               .head = *ras_block,
+       };
+
+       if (!amdgpu_persistent_edc_harvesting_supported(adev))
+               return 0;
+
+       /* a failed query is only reported; the reset below is still attempted */
+       if (amdgpu_ras_query_error_status(adev, &info) != 0)
+               DRM_WARN("RAS init harvest failure\n");
+
+       if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0)
+               DRM_WARN("RAS init harvest reset failure\n");
+
+       return 0;
+}
+
 /* helper function to handle common stuff in ip late init phase */
 int amdgpu_ras_late_init(struct amdgpu_device *adev,
                         struct ras_common_if *ras_block,
@@ -2119,6 +2145,9 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
                        return r;
        }
 
+       /* check for errors on ASICs that support persistent EDC across warm reset */
+       amdgpu_persistent_edc_harvesting(adev, ras_block);
+
        /* in resume phase, no need to create ras fs node */
        if (adev->in_suspend || amdgpu_in_reset(adev))
                return 0;