drm/amdgpu: message smu to update bad channel info
authorStanley.Yang <Stanley.Yang@amd.com>
Thu, 3 Mar 2022 09:56:33 +0000 (17:56 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Tue, 15 Mar 2022 18:25:16 +0000 (14:25 -0400)
The driver should notify the SMU to update its bad channel info when an
uncorrectable error is detected in the UMC block.

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c

index d78c297..424c22a 100644 (file)
@@ -2068,6 +2068,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
        mutex_init(&con->recovery_lock);
        INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
        atomic_set(&con->in_recovery, 0);
+       con->eeprom_control.bad_channel_bitmap = 0;
 
        max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
        amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
@@ -2092,6 +2093,11 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
                        goto free;
 
                amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
+
+               if (con->update_channel_flag == true) {
+                       amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
+                       con->update_channel_flag = false;
+               }
        }
 
 #ifdef CONFIG_X86_MCE_AMD
@@ -2285,6 +2291,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
                goto release_con;
        }
 
+       con->update_channel_flag = false;
        con->features = 0;
        INIT_LIST_HEAD(&con->head);
        /* Might need get this flag from vbios. */
index 7cddaad..9314fde 100644 (file)
@@ -374,6 +374,9 @@ struct amdgpu_ras {
 
        /* record umc error info queried from smu */
        struct umc_ecc_info umc_ecc;
+
+       /* Indicates whether updated bad channel info needs to be sent to the SMU */
+       bool update_channel_flag;
 };
 
 struct ras_fs_data {
index a44f2ee..c428398 100644 (file)
@@ -267,6 +267,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
 {
        struct amdgpu_device *adev = to_amdgpu_device(control);
        struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        u8 csum;
        int res;
 
@@ -287,6 +288,10 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
 
        amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs);
 
+       control->bad_channel_bitmap = 0;
+       amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap);
+       con->update_channel_flag = false;
+
        amdgpu_ras_debugfs_set_ret_size(control);
 
        mutex_unlock(&control->ras_tbl_mutex);
@@ -420,6 +425,7 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
                               struct eeprom_table_record *record,
                               const u32 num)
 {
+       struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control));
        u32 a, b, i;
        u8 *buf, *pp;
        int res;
@@ -431,9 +437,16 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
        /* Encode all of them in one go.
         */
        pp = buf;
-       for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE)
+       for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
                __encode_table_record_to_buf(control, &record[i], pp);
 
+               /* update bad channel bitmap */
+               if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
+                       control->bad_channel_bitmap |= 1 << record[i].mem_channel;
+                       con->update_channel_flag = true;
+               }
+       }
+
        /* a, first record index to write into.
         * b, last record index to write into.
         * a = first index to read (fri) + number of records in the table,
@@ -686,6 +699,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
                           const u32 num)
 {
        struct amdgpu_device *adev = to_amdgpu_device(control);
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        int i, res;
        u8 *buf, *pp;
        u32 g0, g1;
@@ -753,8 +767,15 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
        /* Read up everything? Then transform.
         */
        pp = buf;
-       for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE)
+       for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
                __decode_table_record_from_buf(control, &record[i], pp);
+
+               /* update bad channel bitmap */
+               if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
+                       control->bad_channel_bitmap |= 1 << record[i].mem_channel;
+                       con->update_channel_flag = true;
+               }
+       }
 Out:
        kfree(buf);
        mutex_unlock(&control->ras_tbl_mutex);
index 6bb0057..54d9bfe 100644 (file)
@@ -80,6 +80,10 @@ struct amdgpu_ras_eeprom_control {
        /* Protect table access via this mutex.
         */
        struct mutex ras_tbl_mutex;
+
+       /* Bitmap of the memory channels on which bad pages have occurred.
+        */
+       u32 bad_channel_bitmap;
 };
 
 /*
index 85da6cb..aad3c8b 100644 (file)
@@ -97,6 +97,11 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
                        amdgpu_ras_save_bad_pages(adev);
 
                        amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
+
+                       if (con->update_channel_flag == true) {
+                               amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
+                               con->update_channel_flag = false;
+                       }
                }
 
                if (reset)