From 55bf196f60dfc89488c5645d112a9176c6fe4708 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Christian=20K=C3=B6nig?= Date: Tue, 18 Apr 2023 13:18:01 +0200 Subject: [PATCH] drm/amdgpu: reset VM when an error is detected MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit When some problem with the updates of page tables is detected reset the state machine of the VM and re-create all page tables from scratch. Signed-off-by: Christian König Reviewed-by: Luben Tuikov Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 81 +++++++++++++++++++++++++++------- 1 file changed, 65 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 5dcc800..1045be4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -267,6 +267,32 @@ static void amdgpu_vm_bo_done(struct amdgpu_vm_bo_base *vm_bo) } /** + * amdgpu_vm_bo_reset_state_machine - reset the vm_bo state machine + * @vm: the VM which state machine to reset + * + * Move all vm_bo object in the VM into a state where they will be updated + * again during validation. + */ +static void amdgpu_vm_bo_reset_state_machine(struct amdgpu_vm *vm) +{ + struct amdgpu_vm_bo_base *vm_bo, *tmp; + + spin_lock(&vm->status_lock); + list_splice_init(&vm->done, &vm->invalidated); + list_for_each_entry(vm_bo, &vm->invalidated, vm_status) + vm_bo->moved = true; + list_for_each_entry_safe(vm_bo, tmp, &vm->idle, vm_status) { + struct amdgpu_bo *bo = vm_bo->bo; + + if (!bo || bo->tbo.type != ttm_bo_type_kernel) + list_move(&vm_bo->vm_status, &vm_bo->vm->moved); + else if (bo->parent) + list_move(&vm_bo->vm_status, &vm_bo->vm->relocated); + } + spin_unlock(&vm->status_lock); +} + +/** * amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm * * @base: base structure for tracking BO usage in a VM @@ -351,6 +377,34 @@ void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev, spin_unlock(&adev->mman.bdev.lru_lock); } +/* Create scheduler entities for page table updates */ +static int amdgpu_vm_init_entities(struct amdgpu_device *adev, + struct amdgpu_vm *vm) +{ + int r; + + r = drm_sched_entity_init(&vm->immediate, DRM_SCHED_PRIORITY_NORMAL, + adev->vm_manager.vm_pte_scheds, + adev->vm_manager.vm_pte_num_scheds, NULL); + if (r) + goto error; + + return drm_sched_entity_init(&vm->delayed, DRM_SCHED_PRIORITY_NORMAL, + adev->vm_manager.vm_pte_scheds, + adev->vm_manager.vm_pte_num_scheds, NULL); + +error: + drm_sched_entity_destroy(&vm->immediate); + return r; +} + +/* Destroy the entities for page table updates again */ +static void amdgpu_vm_fini_entities(struct amdgpu_vm *vm) +{ + drm_sched_entity_destroy(&vm->immediate); + drm_sched_entity_destroy(&vm->delayed); +} + /** * amdgpu_vm_validate_pt_bos - validate the page table BOs * @@ -373,6 +427,14 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm, struct amdgpu_bo *bo; int r; + if (drm_sched_entity_error(&vm->delayed)) { + amdgpu_vm_bo_reset_state_machine(vm); + amdgpu_vm_fini_entities(vm); + r = amdgpu_vm_init_entities(adev, vm); + if (r) + return r; + } + spin_lock(&vm->status_lock); while (!list_empty(&vm->evicted)) { bo_base = list_first_entry(&vm->evicted, @@ -2048,19 +2110,10 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm) INIT_LIST_HEAD(&vm->pt_freed); INIT_WORK(&vm->pt_free_work, amdgpu_vm_pt_free_work); - /* create scheduler entities for page table updates */ - r = drm_sched_entity_init(&vm->immediate, DRM_SCHED_PRIORITY_NORMAL, - adev->vm_manager.vm_pte_scheds, - adev->vm_manager.vm_pte_num_scheds, NULL); + r = amdgpu_vm_init_entities(adev, vm); if (r) return r; - r = drm_sched_entity_init(&vm->delayed, DRM_SCHED_PRIORITY_NORMAL, - adev->vm_manager.vm_pte_scheds, - adev->vm_manager.vm_pte_num_scheds, NULL); - if (r) - goto error_free_immediate; - vm->pte_support_ats = false; vm->is_compute_context = false; @@ -2121,10 +2174,7 @@ error_free_root: error_free_delayed: dma_fence_put(vm->last_tlb_flush); dma_fence_put(vm->last_unlocked); - drm_sched_entity_destroy(&vm->delayed); - -error_free_immediate: - drm_sched_entity_destroy(&vm->immediate); + amdgpu_vm_fini_entities(vm); return r; } @@ -2277,8 +2327,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) amdgpu_bo_unref(&root); WARN_ON(vm->root.bo); - drm_sched_entity_destroy(&vm->immediate); - drm_sched_entity_destroy(&vm->delayed); + amdgpu_vm_fini_entities(vm); if (!RB_EMPTY_ROOT(&vm->va.rb_root)) { dev_err(adev->dev, "still active bo inside vm\n"); -- 2.7.4