drm/amdgpu: message smu to update bad channel info

author Stanley.Yang <Stanley.Yang@amd.com>

Thu, 3 Mar 2022 09:56:33 +0000 (17:56 +0800)

committer Alex Deucher <alexander.deucher@amd.com>

Tue, 15 Mar 2022 18:25:16 +0000 (14:25 -0400)
author Stanley.Yang <Stanley.Yang@amd.com>
Thu, 3 Mar 2022 09:56:33 +0000 (17:56 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 15 Mar 2022 18:25:16 +0000 (14:25 -0400)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

index d78c297..424c22a 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2068,6 +2068,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
         mutex_init(&con->recovery_lock);
         INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
         atomic_set(&con->in_recovery, 0);
+       con->eeprom_control.bad_channel_bitmap = 0;
  
         max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
         amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
@@ -2092,6 +2093,11 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
                         goto free;
  
                 amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
+
+               if (con->update_channel_flag == true) {
+                       amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
+                       con->update_channel_flag = false;
+               }
         }
  
  #ifdef CONFIG_X86_MCE_AMD
@@ -2285,6 +2291,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
                 goto release_con;
         }
  
+       con->update_channel_flag = false;
         con->features = 0;
         INIT_LIST_HEAD(&con->head);
         /* Might need get this flag from vbios. */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

index 7cddaad..9314fde 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -374,6 +374,9 @@ struct amdgpu_ras {
  
         /* record umc error info queried from smu */
         struct umc_ecc_info umc_ecc;
+
+       /* Indicates whether updated bad channel info needs to be sent to the smu */
+       bool update_channel_flag;
  };
  
  struct ras_fs_data {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c

index a44f2ee..c428398 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -267,6 +267,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
  {
         struct amdgpu_device *adev = to_amdgpu_device(control);
         struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
         u8 csum;
         int res;
  
@@ -287,6 +288,10 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
  
         amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs);
  
+       control->bad_channel_bitmap = 0;
+       amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap);
+       con->update_channel_flag = false;
+
         amdgpu_ras_debugfs_set_ret_size(control);
  
         mutex_unlock(&control->ras_tbl_mutex);
@@ -420,6 +425,7 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
                                struct eeprom_table_record *record,
                                const u32 num)
  {
+       struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control));
         u32 a, b, i;
         u8 *buf, *pp;
         int res;
@@ -431,9 +437,16 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
         /* Encode all of them in one go.
          */
         pp = buf;
-       for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE)
+       for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
                 __encode_table_record_to_buf(control, &record[i], pp);
  
+               /* update bad channel bitmap */
+               if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
+                       control->bad_channel_bitmap |= 1 << record[i].mem_channel;
+                       con->update_channel_flag = true;
+               }
+       }
+
         /* a, first record index to write into.
          * b, last record index to write into.
          * a = first index to read (fri) + number of records in the table,
@@ -686,6 +699,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
                            const u32 num)
  {
         struct amdgpu_device *adev = to_amdgpu_device(control);
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
         int i, res;
         u8 *buf, *pp;
         u32 g0, g1;
@@ -753,8 +767,15 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
         /* Read up everything? Then transform.
          */
         pp = buf;
-       for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE)
+       for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
                 __decode_table_record_from_buf(control, &record[i], pp);
+
+               /* update bad channel bitmap */
+               if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
+                       control->bad_channel_bitmap |= 1 << record[i].mem_channel;
+                       con->update_channel_flag = true;
+               }
+       }
  Out:
         kfree(buf);
         mutex_unlock(&control->ras_tbl_mutex);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h

index 6bb0057..54d9bfe 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -80,6 +80,10 @@ struct amdgpu_ras_eeprom_control {
         /* Protect table access via this mutex.
          */
         struct mutex ras_tbl_mutex;
+
+       /* Record the channels on which bad pages have occurred
+        */
+       u32 bad_channel_bitmap;
  };
  
  /*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c

index 85da6cb..aad3c8b 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -97,6 +97,11 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
                         amdgpu_ras_save_bad_pages(adev);
  
                         amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
+
+                       if (con->update_channel_flag == true) {
+                               amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
+                               con->update_channel_flag = false;
+                       }
                 }
  
                 if (reset)
author	Stanley.Yang <Stanley.Yang@amd.com>
	Thu, 3 Mar 2022 09:56:33 +0000 (17:56 +0800)
committer	Alex Deucher <alexander.deucher@amd.com>
	Tue, 15 Mar 2022 18:25:16 +0000 (14:25 -0400)
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c		patch \| blob \| history