drm/amdgpu: break driver init process when it's bad GPU(v5)

author Guchun Chen <guchun.chen@amd.com>

Thu, 23 Jul 2020 07:42:19 +0000 (15:42 +0800)

committer Alex Deucher <alexander.deucher@amd.com>

Tue, 4 Aug 2020 21:26:31 +0000 (17:26 -0400)
author Guchun Chen <guchun.chen@amd.com>
Thu, 23 Jul 2020 07:42:19 +0000 (15:42 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 4 Aug 2020 21:26:31 +0000 (17:26 -0400)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 40caa74..905c5ab 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2055,13 +2055,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
          * it should be called after amdgpu_device_ip_hw_init_phase2  since
          * for some ASICs the RAS EEPROM code relies on SMU fully functioning
          * for I2C communication which only true at this point.
-        * recovery_init may fail, but it can free all resources allocated by
-        * itself and its failure should not stop amdgpu init process.
+        *
+        * amdgpu_ras_recovery_init may fail, but the caller only cares about
+        * the failure caused by a bad GPU and stops the amdgpu init process
+        * accordingly. For all other failures, it still releases all its
+        * resources and prints an error message, rather than returning a
+        * negative value to the caller.
          *
          * Note: theoretically, this should be called before all vram allocations
          * to protect retired page from abusing
          */
-       amdgpu_ras_recovery_init(adev);
+       r = amdgpu_ras_recovery_init(adev);
+       if (r)
+               goto init_failed;
  
         if (adev->gmc.xgmi.num_physical_nodes > 1)
                 amdgpu_xgmi_add_device(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

index 6660094..1a1652e 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1821,6 +1821,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
         struct ras_err_handler_data **data;
         uint32_t max_eeprom_records_len = 0;
+       bool exc_err_limit = false;
         int ret;
  
         if (con)
@@ -1842,8 +1843,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
         max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
         amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
  
-       ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
-       if (ret)
+       ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
+       /*
+        * The call is treated as failed when exc_err_limit is true
+        * or ret != 0.
+        */
+       if (exc_err_limit || ret)
                 goto free;
  
         if (con->eeprom_control.num_recs) {
@@ -1867,6 +1872,15 @@ free:
  out:
         dev_warn(adev->dev, "Failed to initialize ras recovery!\n");
  
+       /*
+        * Except for the case where the error threshold is exceeded, failures
+        * in this function do not fail amdgpu driver init.
+        */
+       if (!exc_err_limit)
+               ret = 0;
+       else
+               ret = -EINVAL;
+
         return ret;
  }
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c

index 35c0c84..da8b35a 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -241,7 +241,8 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
  
  }
  
-int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
+int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
+                       bool *exceed_err_limit)
  {
         int ret = 0;
         struct amdgpu_device *adev = to_amdgpu_device(control);
@@ -254,6 +255,8 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
                         .buf    = buff,
         };
  
+       *exceed_err_limit = false;
+
         /* Verify i2c adapter is initialized */
         if (!adev->pm.smu_i2c.algo)
                 return -ENOENT;
@@ -282,6 +285,11 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
                 DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
                                  control->num_recs);
  
+       } else if ((hdr->header == EEPROM_TABLE_HDR_BAD) &&
+                       (amdgpu_bad_page_threshold != 0)) {
+               *exceed_err_limit = true;
+               dev_err(adev->dev, "Exceeding the bad_page_threshold parameter, "
+                               "disabling the GPU.\n");
         } else {
                 DRM_INFO("Creating new EEPROM table");
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h

index e285e8c..9839b4e 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -76,7 +76,8 @@ struct eeprom_table_record {
         unsigned char mcumc_id;
  }__attribute__((__packed__));
  
-int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control);
+int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
+                       bool *exceed_err_limit);
  int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control);
  
  int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
author	Guchun Chen <guchun.chen@amd.com>
	Thu, 23 Jul 2020 07:42:19 +0000 (15:42 +0800)
committer	Alex Deucher <alexander.deucher@amd.com>
	Tue, 4 Aug 2020 21:26:31 +0000 (17:26 -0400)
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h		patch \| blob \| history