From f89b881c81d9a6481fc17b46b351ca38f5dd6f3a Mon Sep 17 00:00:00 2001 From: Dennis Li Date: Mon, 22 Feb 2021 18:22:57 +0800 Subject: [PATCH] drm/amdgpu: reserve backup pages for bad page retirment To ensure user has a constant of VRAM accessible in run-time, driver reserves limit backup pages when init, and return ones when bad pages retired, to keep no change of unused memory size. v2: refine codes to calculate badpags threshold Reviewed-by: Hawking Zhang Signed-off-by: Dennis Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 4 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 29 +++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 4 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 96 +++++++++++++++++++++++++++- 4 files changed, 118 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 4575192..4557f47 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -171,7 +171,7 @@ struct amdgpu_mgpu_info mgpu_info = { }; int amdgpu_ras_enable = -1; uint amdgpu_ras_mask = 0xffffffff; -int amdgpu_bad_page_threshold = -1; +int amdgpu_bad_page_threshold = 100; /** * DOC: vramlimit (int) @@ -805,7 +805,7 @@ module_param_named(reset_method, amdgpu_reset_method, int, 0444); * faulty pages by ECC exceed threshold value and leave it for user's further * check. */ -MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default value), 0 = disable bad page retirement)"); +MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto, 0 = disable bad page retirement, 100 = default value"); module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444); MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup (8 if set to greater than 8 or less than 0, only affect gfx 8+)"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 93699ea..09546de 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1747,13 +1747,14 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, return ret; } -static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, - uint32_t max_length) +static uint32_t +amdgpu_ras_calculate_badpags_threshold(struct amdgpu_device *adev) { - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); int tmp_threshold = amdgpu_bad_page_threshold; u64 val; + uint32_t max_length = 0; + max_length = amdgpu_ras_eeprom_get_record_max_length(); /* * Justification of value bad_page_cnt_threshold in ras structure * @@ -1779,20 +1780,18 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, tmp_threshold = max_length; if (tmp_threshold == -1) { - val = adev->gmc.mc_vram_size; + val = adev->gmc.real_vram_size; do_div(val, RAS_BAD_PAGE_RATE); - con->bad_page_cnt_threshold = min(lower_32_bits(val), - max_length); - } else { - con->bad_page_cnt_threshold = tmp_threshold; + tmp_threshold = min(lower_32_bits(val), max_length); } + + return tmp_threshold; } int amdgpu_ras_recovery_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data; - uint32_t max_eeprom_records_len = 0; bool exc_err_limit = false; int ret; @@ -1812,8 +1811,16 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) atomic_set(&con->in_recovery, 0); con->adev = adev; - max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length(); - amdgpu_ras_validate_threshold(adev, max_eeprom_records_len); + if (!con->bad_page_cnt_threshold) { + con->bad_page_cnt_threshold = + amdgpu_ras_calculate_badpags_threshold(adev); + + ret = amdgpu_vram_mgr_reserve_backup_pages( + ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM), + con->bad_page_cnt_threshold); + if (ret) + goto out; + } ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit); /* diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h index a4ca34a..1eee654 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h @@ -48,6 +48,8 @@ struct amdgpu_vram_mgr { spinlock_t lock; struct list_head reservations_pending; struct list_head reserved_pages; + struct list_head backup_pages; + uint32_t num_backup_pages; atomic64_t usage; atomic64_t vis_usage; }; @@ -123,6 +125,8 @@ uint64_t amdgpu_vram_mgr_usage(struct ttm_resource_manager *man); uint64_t amdgpu_vram_mgr_vis_usage(struct ttm_resource_manager *man); int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man, uint64_t start, uint64_t size); +int amdgpu_vram_mgr_reserve_backup_pages(struct ttm_resource_manager *man, + uint32_t num_pages); int amdgpu_vram_mgr_query_page_status(struct ttm_resource_manager *man, uint64_t start); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index c89b66b..de7db14 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -28,6 +28,9 @@ #include "amdgpu_atomfirmware.h" #include "atom.h" +static int amdgpu_vram_mgr_free_backup_pages(struct amdgpu_vram_mgr *mgr, + uint32_t num_pages); + static inline struct amdgpu_vram_mgr *to_vram_mgr(struct ttm_resource_manager *man) { return container_of(man, struct amdgpu_vram_mgr, manager); @@ -186,6 +189,7 @@ int amdgpu_vram_mgr_init(struct amdgpu_device *adev) spin_lock_init(&mgr->lock); INIT_LIST_HEAD(&mgr->reservations_pending); INIT_LIST_HEAD(&mgr->reserved_pages); + INIT_LIST_HEAD(&mgr->backup_pages); /* Add the two VRAM-related sysfs files */ ret = sysfs_create_files(&adev->dev->kobj, amdgpu_vram_mgr_attributes); @@ -226,6 +230,11 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev) drm_mm_remove_node(&rsv->mm_node); kfree(rsv); } + + list_for_each_entry_safe(rsv, temp, &mgr->backup_pages, node) { + drm_mm_remove_node(&rsv->mm_node); + kfree(rsv); + } drm_mm_takedown(&mgr->mm); spin_unlock(&mgr->lock); @@ -297,12 +306,14 @@ static void amdgpu_vram_mgr_do_reserve(struct ttm_resource_manager *man) continue; dev_dbg(adev->dev, "Reservation 0x%llx - %lld, Succeeded\n", - rsv->mm_node.start, rsv->mm_node.size); + rsv->mm_node.start << PAGE_SHIFT, rsv->mm_node.size); vis_usage = amdgpu_vram_mgr_vis_size(adev, &rsv->mm_node); atomic64_add(vis_usage, &mgr->vis_usage); atomic64_add(rsv->mm_node.size << PAGE_SHIFT, &mgr->usage); list_move(&rsv->node, &mgr->reserved_pages); + + amdgpu_vram_mgr_free_backup_pages(mgr, rsv->mm_node.size); } } @@ -319,6 +330,7 @@ int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man, uint64_t start, uint64_t size) { struct amdgpu_vram_mgr *mgr = to_vram_mgr(man); + struct amdgpu_device *adev = to_amdgpu_device(mgr); struct amdgpu_vram_reservation *rsv; rsv = kzalloc(sizeof(*rsv), GFP_KERNEL); @@ -329,14 +341,94 @@ int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man, rsv->mm_node.start = start >> PAGE_SHIFT; rsv->mm_node.size = size >> PAGE_SHIFT; + dev_dbg(adev->dev, "Pending Reservation: 0x%llx\n", start); + spin_lock(&mgr->lock); - list_add_tail(&mgr->reservations_pending, &rsv->node); + list_add_tail(&rsv->node, &mgr->reservations_pending); amdgpu_vram_mgr_do_reserve(man); spin_unlock(&mgr->lock); return 0; } +static int amdgpu_vram_mgr_free_backup_pages(struct amdgpu_vram_mgr *mgr, + uint32_t num_pages) +{ + struct amdgpu_device *adev = to_amdgpu_device(mgr); + struct amdgpu_vram_reservation *rsv; + uint32_t i; + uint64_t vis_usage = 0, total_usage = 0; + + if (num_pages > mgr->num_backup_pages) { + dev_warn(adev->dev, "No enough backup pages\n"); + return -EINVAL; + } + + for (i = 0; i < num_pages; i++) { + rsv = list_first_entry(&mgr->backup_pages, + struct amdgpu_vram_reservation, node); + vis_usage += amdgpu_vram_mgr_vis_size(adev, &rsv->mm_node); + total_usage += (rsv->mm_node.size << PAGE_SHIFT); + drm_mm_remove_node(&rsv->mm_node); + list_del(&rsv->node); + kfree(rsv); + mgr->num_backup_pages--; + } + + atomic64_sub(total_usage, &mgr->usage); + atomic64_sub(vis_usage, &mgr->vis_usage); + + return 0; +} + +int amdgpu_vram_mgr_reserve_backup_pages(struct ttm_resource_manager *man, + uint32_t num_pages) +{ + struct amdgpu_vram_mgr *mgr = to_vram_mgr(man); + struct amdgpu_device *adev = to_amdgpu_device(mgr); + struct amdgpu_vram_reservation *rsv; + struct drm_mm *mm = &mgr->mm; + uint32_t i; + int ret = 0; + uint64_t vis_usage, total_usage; + + for (i = 0; i < num_pages; i++) { + rsv = kzalloc(sizeof(*rsv), GFP_KERNEL); + if (!rsv) { + ret = -ENOMEM; + goto pro_end; + } + + INIT_LIST_HEAD(&rsv->node); + + ret = drm_mm_insert_node(mm, &rsv->mm_node, 1); + if (ret) { + dev_err(adev->dev, "failed to reserve backup page %d, ret 0x%x\n", i, ret); + kfree(rsv); + goto pro_end; + } + + vis_usage = amdgpu_vram_mgr_vis_size(adev, &rsv->mm_node); + total_usage = (rsv->mm_node.size << PAGE_SHIFT); + + spin_lock(&mgr->lock); + atomic64_add(vis_usage, &mgr->vis_usage); + atomic64_add(total_usage, &mgr->usage); + list_add_tail(&rsv->node, &mgr->backup_pages); + mgr->num_backup_pages++; + spin_unlock(&mgr->lock); + } + +pro_end: + if (ret) { + spin_lock(&mgr->lock); + amdgpu_vram_mgr_free_backup_pages(mgr, mgr->num_backup_pages); + spin_unlock(&mgr->lock); + } + + return ret; +} + /** * amdgpu_vram_mgr_query_page_status - query the reservation status * -- 2.7.4