drm/amdgpu: Enable mode-1 reset for RAS recovery in fatal error mode
author YiPeng Chai <YiPeng.Chai@amd.com>
Tue, 8 Nov 2022 09:11:18 +0000 (17:11 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 17 Nov 2022 23:07:52 +0000 (18:07 -0500)
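This patch enables mode-1 reset for RAS recovery in fatal error mode. When poison mode is not supported, amdgpu_device_should_recover_gpu() skips the soft reset check and returns true, and amdgpu_ras_do_recovery() sets AMDGPU_NEED_FULL_RESET in the reset context instead of clearing it.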

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

index 0f5cbc7..3d5d5d4 100644 (file)
@@ -4593,6 +4593,10 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
        if (amdgpu_gpu_recovery == 0)
                goto disabled;
 
+       /* Skip soft reset check in fatal error mode */
+       if (!amdgpu_ras_is_poison_mode_supported(adev))
+               return true;
+
        if (!amdgpu_device_ip_check_soft_reset(adev)) {
                dev_info(adev->dev,"Timeout, but no hardware hang detected.\n");
                return false;
index 693bce0..8fca3cc 100644 (file)
@@ -1948,7 +1948,12 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 
                reset_context.method = AMD_RESET_METHOD_NONE;
                reset_context.reset_req_dev = adev;
-               clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+
+               /* Perform full reset in fatal error mode */
+               if (!amdgpu_ras_is_poison_mode_supported(ras->adev))
+                       set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+               else
+                       clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 
                amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
        }