drm/amdgpu: perform mode2 reset for sdma fed error on gfx v11_0_3
author YiPeng Chai <YiPeng.Chai@amd.com>
Tue, 16 May 2023 09:34:17 +0000 (17:34 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Fri, 9 Jun 2023 14:38:19 +0000 (10:38 -0400)
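Perform a mode2 reset for SDMA FED errors on gfx v11_0_3. When the poison
consumption handler finds SDMA0_FED_ERR or SDMA1_FED_ERR set in
RLC_RLCS_FED_STATUS_0, it sets AMDGPU_RAS_GPU_RESET_MODE2_RESET in the RAS
context; amdgpu_ras_do_recovery then clears that flag and selects
AMD_RESET_METHOD_MODE2 when poison mode is supported.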

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c

index 6bb4386..f2da69a 100644 (file)
@@ -2053,9 +2053,15 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
                /* Perform full reset in fatal error mode */
                if (!amdgpu_ras_is_poison_mode_supported(ras->adev))
                        set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-               else
+               else {
                        clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 
+                       if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) {
+                               ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+                               reset_context.method = AMD_RESET_METHOD_MODE2;
+                       }
+               }
+
                amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
        }
        atomic_set(&ras->in_recovery, 0);
index bc43f7d..46bf188 100644 (file)
@@ -339,6 +339,8 @@ enum amdgpu_ras_ret {
 #define AMDGPU_RAS_ERR_STATUS_VALID    (1 << 1)
 #define AMDGPU_RAS_ERR_ADDRESS_VALID   (1 << 2)
 
+#define AMDGPU_RAS_GPU_RESET_MODE2_RESET  (0x1 << 0)
+
 struct amdgpu_ras_err_status_reg_entry {
        uint32_t hwip;
        uint32_t ip_inst;
@@ -427,6 +429,9 @@ struct amdgpu_ras {
 
        /* Indicates smu whether need update bad channel info */
        bool update_channel_flag;
+
+       /* Record special requirements of gpu reset caller */
+       uint32_t  gpu_reset_flags;
 };
 
 struct ras_fs_data {
index 068b958..26d6286 100644 (file)
@@ -84,8 +84,20 @@ static int gfx_v11_0_3_poison_consumption_handler(struct amdgpu_device *adev,
        /* Workaround: when vmid and pasid are both zero, trigger gpu reset in KGD. */
        if (entry && (entry->client_id == SOC21_IH_CLIENTID_GFX) &&
            (entry->src_id == GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT) &&
-            !entry->vmid && !entry->pasid)
+            !entry->vmid && !entry->pasid) {
+               uint32_t rlc_status0 = 0;
+
+               rlc_status0 = RREG32_SOC15(GC, 0, regRLC_RLCS_FED_STATUS_0);
+
+               if (REG_GET_FIELD(rlc_status0, RLC_RLCS_FED_STATUS_0, SDMA0_FED_ERR) ||
+                   REG_GET_FIELD(rlc_status0, RLC_RLCS_FED_STATUS_0, SDMA1_FED_ERR)) {
+                       struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+                       ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+               }
+
                amdgpu_ras_reset_gpu(adev);
+       }
 
        return 0;
 }