drm/amdgpu: Fix RAS page retirement with mode2 reset on Aldebaran

author Mukul Joshi <mukul.joshi@amd.com>

Tue, 21 Sep 2021 00:48:23 +0000 (20:48 -0400)

committer Alex Deucher <alexander.deucher@amd.com>

Wed, 13 Oct 2021 18:14:48 +0000 (14:14 -0400)
author Mukul Joshi <mukul.joshi@amd.com>
Tue, 21 Sep 2021 00:48:23 +0000 (20:48 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
Wed, 13 Oct 2021 18:14:48 +0000 (14:14 -0400)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e887535..08133de 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -112,7 +112,12 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
  static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
                                 uint64_t addr);
  #ifdef CONFIG_X86_MCE_AMD
-static void amdgpu_register_bad_pages_mca_notifier(void);
+static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
+struct mce_notifier_adev_list {
+       struct amdgpu_device *devs[MAX_GPU_INSTANCE];
+       int num_gpu;
+};
+static struct mce_notifier_adev_list mce_adev_list;
  #endif
  
  void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
@@ -2108,7 +2113,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
  #ifdef CONFIG_X86_MCE_AMD
         if ((adev->asic_type == CHIP_ALDEBARAN) &&
             (adev->gmc.xgmi.connected_to_cpu))
-               amdgpu_register_bad_pages_mca_notifier();
+               amdgpu_register_bad_pages_mca_notifier(adev);
  #endif
         return 0;
  
@@ -2605,24 +2610,18 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev)
  #ifdef CONFIG_X86_MCE_AMD
  static struct amdgpu_device *find_adev(uint32_t node_id)
  {
-       struct amdgpu_gpu_instance *gpu_instance;
         int i;
         struct amdgpu_device *adev = NULL;
  
-       mutex_lock(&mgpu_info.mutex);
-
-       for (i = 0; i < mgpu_info.num_gpu; i++) {
-               gpu_instance = &(mgpu_info.gpu_ins[i]);
-               adev = gpu_instance->adev;
+       for (i = 0; i < mce_adev_list.num_gpu; i++) {
+               adev = mce_adev_list.devs[i];
  
-               if (adev->gmc.xgmi.connected_to_cpu &&
+               if (adev && adev->gmc.xgmi.connected_to_cpu &&
                     adev->gmc.xgmi.physical_node_id == node_id)
                         break;
                 adev = NULL;
         }
  
-       mutex_unlock(&mgpu_info.mutex);
-
         return adev;
  }
  
@@ -2718,9 +2717,19 @@ static struct notifier_block amdgpu_bad_page_nb = {
         .priority       = MCE_PRIO_UC,
  };
  
-static void amdgpu_register_bad_pages_mca_notifier(void)
+static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev)
  {
         /*
+        * Add the adev to the mce_adev_list.
+        * During mode2 reset, amdgpu device is temporarily
+        * removed from the mgpu_info list which can cause
+        * page retirement to fail.
+        * Use this list instead of mgpu_info to find the amdgpu
+        * device on which the UMC error was reported.
+        */
+       mce_adev_list.devs[mce_adev_list.num_gpu++] = adev;
+
+       /*
          * Register the x86 notifier only once
          * with MCE subsystem.
          */
author	Mukul Joshi <mukul.joshi@amd.com>
	Tue, 21 Sep 2021 00:48:23 +0000 (20:48 -0400)
committer	Alex Deucher <alexander.deucher@amd.com>
	Wed, 13 Oct 2021 18:14:48 +0000 (14:14 -0400)