drm/amdgpu: Conditionally reset RAS counters on boot
author John Clements <john.clements@amd.com>
Mon, 17 May 2021 08:36:26 +0000 (16:36 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 20 May 2021 02:38:11 +0000 (22:38 -0400)
Only clear RAS error counters if persistent EDC harvesting is not supported

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

index 95d4f43a03df465b701aab408cdbce729a146b72..a0be0772c8b3c3bff7e95aa144152ad2a03c97c1 100644 (file)
@@ -607,7 +607,6 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev)
        struct ras_ih_if ih_info = {
                .cb = amdgpu_gfx_process_ras_data_cb,
        };
-       struct ras_query_if info = { 0 };
 
        if (!adev->gfx.ras_if) {
                adev->gfx.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
@@ -625,12 +624,8 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev)
                goto free;
 
        if (amdgpu_ras_is_supported(adev, adev->gfx.ras_if->block)) {
-               if (adev->gmc.xgmi.connected_to_cpu) {
-                       info.head = *adev->gfx.ras_if;
-                       amdgpu_ras_query_error_status(adev, &info);
-               } else {
+               if (!amdgpu_persistent_edc_harvesting_supported(adev))
                        amdgpu_ras_reset_error_status(adev, AMDGPU_RAS_BLOCK__GFX);
-               }
 
                r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
                if (r)
index b1c57a5b6e898215f764bf09b652686f87277559..e3a4c3a7635ac050b64dc2249b8e63267054f021 100644 (file)
@@ -2194,7 +2194,7 @@ release_con:
        return r;
 }
 
-static int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
+int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
 {
        if (adev->gmc.xgmi.connected_to_cpu)
                return 1;
index 201fbdee1d090ad94ec1a973fd108f8a4af91181..bfa40c8ecc94e0b16546331f6e0dc168148aa00a 100644 (file)
@@ -625,4 +625,7 @@ void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready);
 bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev);
 
 void amdgpu_release_ras_context(struct amdgpu_device *adev);
+
+int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev);
+
 #endif
index 093ab98c31bb508a9b452af5441539dab72798e7..fb3c7588dde3d04f9ab81d4cc3be7f3c16f2824e 100644 (file)
@@ -1268,13 +1268,15 @@ static int gmc_v9_0_late_init(void *handle)
                }
        }
 
-       if (adev->mmhub.ras_funcs &&
-           adev->mmhub.ras_funcs->reset_ras_error_count)
-               adev->mmhub.ras_funcs->reset_ras_error_count(adev);
-
-       if (adev->hdp.ras_funcs &&
-           adev->hdp.ras_funcs->reset_ras_error_count)
-               adev->hdp.ras_funcs->reset_ras_error_count(adev);
+       if (!amdgpu_persistent_edc_harvesting_supported(adev)) {
+               if (adev->mmhub.ras_funcs &&
+                   adev->mmhub.ras_funcs->reset_ras_error_count)
+                       adev->mmhub.ras_funcs->reset_ras_error_count(adev);
+
+               if (adev->hdp.ras_funcs &&
+                   adev->hdp.ras_funcs->reset_ras_error_count)
+                       adev->hdp.ras_funcs->reset_ras_error_count(adev);
+       }
 
        r = amdgpu_gmc_ras_late_init(adev);
        if (r)