drm/amdgpu: reset VM when an error is detected
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 3c03105..1045be4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -267,6 +267,32 @@ static void amdgpu_vm_bo_done(struct amdgpu_vm_bo_base *vm_bo)
 }
 
 /**
+ * amdgpu_vm_bo_reset_state_machine - reset the vm_bo state machine
+ * @vm: the VM whose state machine to reset
+ *
+ * Move all vm_bo objects in the VM into a state where they will be updated
+ * again during validation.
+ */
+static void amdgpu_vm_bo_reset_state_machine(struct amdgpu_vm *vm)
+{
+       struct amdgpu_vm_bo_base *vm_bo, *tmp;
+
+       spin_lock(&vm->status_lock);
+       list_splice_init(&vm->done, &vm->invalidated);
+       list_for_each_entry(vm_bo, &vm->invalidated, vm_status)
+               vm_bo->moved = true;
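+       /*
+        * Idle BOs go back to the moved list, page tables (except the root)
+        * go back to the relocated list.
+        */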
+       list_for_each_entry_safe(vm_bo, tmp, &vm->idle, vm_status) {
+               struct amdgpu_bo *bo = vm_bo->bo;
+
+               if (!bo || bo->tbo.type != ttm_bo_type_kernel)
+                       list_move(&vm_bo->vm_status, &vm_bo->vm->moved);
+               else if (bo->parent)
+                       list_move(&vm_bo->vm_status, &vm_bo->vm->relocated);
+       }
+       spin_unlock(&vm->status_lock);
+}
+
+/**
  * amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm
  *
  * @base: base structure for tracking BO usage in a VM
@@ -351,6 +377,34 @@ void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
        spin_unlock(&adev->mman.bdev.lru_lock);
 }
 
+/* Create scheduler entities for page table updates */
+static int amdgpu_vm_init_entities(struct amdgpu_device *adev,
+                                  struct amdgpu_vm *vm)
+{
+       int r;
+
+       r = drm_sched_entity_init(&vm->immediate, DRM_SCHED_PRIORITY_NORMAL,
+                                 adev->vm_manager.vm_pte_scheds,
+                                 adev->vm_manager.vm_pte_num_scheds, NULL);
+       if (r)
+               goto error;
+
+       return drm_sched_entity_init(&vm->delayed, DRM_SCHED_PRIORITY_NORMAL,
+                                    adev->vm_manager.vm_pte_scheds,
+                                    adev->vm_manager.vm_pte_num_scheds, NULL);
+
+error:
+       drm_sched_entity_destroy(&vm->immediate);
+       return r;
+}
+
+/* Destroy the scheduler entities for page table updates */
+static void amdgpu_vm_fini_entities(struct amdgpu_vm *vm)
+{
+       drm_sched_entity_destroy(&vm->immediate);
+       drm_sched_entity_destroy(&vm->delayed);
+}
+
 /**
  * amdgpu_vm_validate_pt_bos - validate the page table BOs
  *
@@ -373,6 +427,14 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
        struct amdgpu_bo *bo;
        int r;
 
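+       /*
+        * A previous page table update on the delayed entity failed, so
+        * re-create the entities and send all BOs through validation again.
+        */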
+       if (drm_sched_entity_error(&vm->delayed)) {
+               amdgpu_vm_bo_reset_state_machine(vm);
+               amdgpu_vm_fini_entities(vm);
+               r = amdgpu_vm_init_entities(adev, vm);
+               if (r)
+                       return r;
+       }
+
        spin_lock(&vm->status_lock);
        while (!list_empty(&vm->evicted)) {
                bo_base = list_first_entry(&vm->evicted,
@@ -920,42 +982,51 @@ error_unlock:
        return r;
 }
 
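+/* Add the memory usage of a single bo_va to @stats */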
+static void amdgpu_vm_bo_get_memory(struct amdgpu_bo_va *bo_va,
+                                   struct amdgpu_mem_stats *stats)
+{
+       struct amdgpu_vm *vm = bo_va->base.vm;
+       struct amdgpu_bo *bo = bo_va->base.bo;
+
+       if (!bo)
+               return;
+
+       /*
+        * For now ignore BOs which are currently locked and potentially
+        * changing their location.
+        */
+       if (bo->tbo.base.resv != vm->root.bo->tbo.base.resv &&
+           !dma_resv_trylock(bo->tbo.base.resv))
+               return;
+
+       amdgpu_bo_get_memory(bo, stats);
+       if (bo->tbo.base.resv != vm->root.bo->tbo.base.resv)
+               dma_resv_unlock(bo->tbo.base.resv);
+}
+
 void amdgpu_vm_get_memory(struct amdgpu_vm *vm,
                          struct amdgpu_mem_stats *stats)
 {
        struct amdgpu_bo_va *bo_va, *tmp;
 
        spin_lock(&vm->status_lock);
-       list_for_each_entry_safe(bo_va, tmp, &vm->idle, base.vm_status) {
-               if (!bo_va->base.bo)
-                       continue;
-               amdgpu_bo_get_memory(bo_va->base.bo, stats);
-       }
-       list_for_each_entry_safe(bo_va, tmp, &vm->evicted, base.vm_status) {
-               if (!bo_va->base.bo)
-                       continue;
-               amdgpu_bo_get_memory(bo_va->base.bo, stats);
-       }
-       list_for_each_entry_safe(bo_va, tmp, &vm->relocated, base.vm_status) {
-               if (!bo_va->base.bo)
-                       continue;
-               amdgpu_bo_get_memory(bo_va->base.bo, stats);
-       }
-       list_for_each_entry_safe(bo_va, tmp, &vm->moved, base.vm_status) {
-               if (!bo_va->base.bo)
-                       continue;
-               amdgpu_bo_get_memory(bo_va->base.bo, stats);
-       }
-       list_for_each_entry_safe(bo_va, tmp, &vm->invalidated, base.vm_status) {
-               if (!bo_va->base.bo)
-                       continue;
-               amdgpu_bo_get_memory(bo_va->base.bo, stats);
-       }
-       list_for_each_entry_safe(bo_va, tmp, &vm->done, base.vm_status) {
-               if (!bo_va->base.bo)
-                       continue;
-               amdgpu_bo_get_memory(bo_va->base.bo, stats);
-       }
+       list_for_each_entry_safe(bo_va, tmp, &vm->idle, base.vm_status)
+               amdgpu_vm_bo_get_memory(bo_va, stats);
+
+       list_for_each_entry_safe(bo_va, tmp, &vm->evicted, base.vm_status)
+               amdgpu_vm_bo_get_memory(bo_va, stats);
+
+       list_for_each_entry_safe(bo_va, tmp, &vm->relocated, base.vm_status)
+               amdgpu_vm_bo_get_memory(bo_va, stats);
+
+       list_for_each_entry_safe(bo_va, tmp, &vm->moved, base.vm_status)
+               amdgpu_vm_bo_get_memory(bo_va, stats);
+
+       list_for_each_entry_safe(bo_va, tmp, &vm->invalidated, base.vm_status)
+               amdgpu_vm_bo_get_memory(bo_va, stats);
+
+       list_for_each_entry_safe(bo_va, tmp, &vm->done, base.vm_status)
+               amdgpu_vm_bo_get_memory(bo_va, stats);
        spin_unlock(&vm->status_lock);
 }
 
@@ -1358,6 +1429,7 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct amdgpu_device *adev,
        amdgpu_vm_bo_base_init(&bo_va->base, vm, bo);
 
        bo_va->ref_count = 1;
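+       /* start out with an already signaled stub fence instead of NULL */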
+       bo_va->last_pt_update = dma_fence_get_stub();
        INIT_LIST_HEAD(&bo_va->valids);
        INIT_LIST_HEAD(&bo_va->invalids);
 
@@ -1433,14 +1505,14 @@ int amdgpu_vm_bo_map(struct amdgpu_device *adev,
        uint64_t eaddr;
 
        /* validate the parameters */
-       if (saddr & ~PAGE_MASK || offset & ~PAGE_MASK ||
-           size == 0 || size & ~PAGE_MASK)
+       if (saddr & ~PAGE_MASK || offset & ~PAGE_MASK || size & ~PAGE_MASK)
+               return -EINVAL;
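+       /* reject size == 0 and wrap-arounds of saddr or offset */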
+       if (saddr + size <= saddr || offset + size <= offset)
                return -EINVAL;
 
        /* make sure object fit at this offset */
        eaddr = saddr + size - 1;
-       if (saddr >= eaddr ||
-           (bo && offset + size > amdgpu_bo_size(bo)) ||
+       if ((bo && offset + size > amdgpu_bo_size(bo)) ||
            (eaddr >= adev->vm_manager.max_pfn << AMDGPU_GPU_PAGE_SHIFT))
                return -EINVAL;
 
@@ -1499,14 +1571,14 @@ int amdgpu_vm_bo_replace_map(struct amdgpu_device *adev,
        int r;
 
        /* validate the parameters */
-       if (saddr & ~PAGE_MASK || offset & ~PAGE_MASK ||
-           size == 0 || size & ~PAGE_MASK)
+       if (saddr & ~PAGE_MASK || offset & ~PAGE_MASK || size & ~PAGE_MASK)
+               return -EINVAL;
+       if (saddr + size <= saddr || offset + size <= offset)
                return -EINVAL;
 
        /* make sure object fit at this offset */
        eaddr = saddr + size - 1;
-       if (saddr >= eaddr ||
-           (bo && offset + size > amdgpu_bo_size(bo)) ||
+       if ((bo && offset + size > amdgpu_bo_size(bo)) ||
            (eaddr >= adev->vm_manager.max_pfn << AMDGPU_GPU_PAGE_SHIFT))
                return -EINVAL;
 
@@ -2038,19 +2110,10 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm)
        INIT_LIST_HEAD(&vm->pt_freed);
        INIT_WORK(&vm->pt_free_work, amdgpu_vm_pt_free_work);
 
-       /* create scheduler entities for page table updates */
-       r = drm_sched_entity_init(&vm->immediate, DRM_SCHED_PRIORITY_NORMAL,
-                                 adev->vm_manager.vm_pte_scheds,
-                                 adev->vm_manager.vm_pte_num_scheds, NULL);
+       r = amdgpu_vm_init_entities(adev, vm);
        if (r)
                return r;
 
-       r = drm_sched_entity_init(&vm->delayed, DRM_SCHED_PRIORITY_NORMAL,
-                                 adev->vm_manager.vm_pte_scheds,
-                                 adev->vm_manager.vm_pte_num_scheds, NULL);
-       if (r)
-               goto error_free_immediate;
-
        vm->pte_support_ats = false;
        vm->is_compute_context = false;
 
@@ -2067,7 +2130,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm)
                vm->update_funcs = &amdgpu_vm_cpu_funcs;
        else
                vm->update_funcs = &amdgpu_vm_sdma_funcs;
-       vm->last_update = NULL;
+
+       vm->last_update = dma_fence_get_stub();
        vm->last_unlocked = dma_fence_get_stub();
        vm->last_tlb_flush = dma_fence_get_stub();
 
@@ -2110,10 +2174,7 @@ error_free_root:
 error_free_delayed:
        dma_fence_put(vm->last_tlb_flush);
        dma_fence_put(vm->last_unlocked);
-       drm_sched_entity_destroy(&vm->delayed);
-
-error_free_immediate:
-       drm_sched_entity_destroy(&vm->immediate);
+       amdgpu_vm_fini_entities(vm);
 
        return r;
 }
@@ -2192,7 +2253,7 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm)
                goto unreserve_bo;
 
        dma_fence_put(vm->last_update);
-       vm->last_update = NULL;
+       vm->last_update = dma_fence_get_stub();
        vm->is_compute_context = true;
 
        /* Free the shadow bo for compute VM */
@@ -2266,8 +2327,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
        amdgpu_bo_unref(&root);
        WARN_ON(vm->root.bo);
 
-       drm_sched_entity_destroy(&vm->immediate);
-       drm_sched_entity_destroy(&vm->delayed);
+       amdgpu_vm_fini_entities(vm);
 
        if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
                dev_err(adev->dev, "still active bo inside vm\n");
@@ -2282,8 +2342,14 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
        }
 
        dma_fence_put(vm->last_update);
-       for (i = 0; i < AMDGPU_MAX_VMHUBS; i++)
-               amdgpu_vmid_free_reserved(adev, vm, i);
+
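+       /* release any VMID reservation this VM still holds */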
+       for (i = 0; i < AMDGPU_MAX_VMHUBS; i++) {
+               if (vm->reserved_vmid[i]) {
+                       amdgpu_vmid_free_reserved(adev, i);
+                       vm->reserved_vmid[i] = false;
+               }
+       }
 }
 
 /**
@@ -2366,18 +2432,25 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
        union drm_amdgpu_vm *args = data;
        struct amdgpu_device *adev = drm_to_adev(dev);
        struct amdgpu_fpriv *fpriv = filp->driver_priv;
-       int r;
+
+       /* No valid flags defined yet */
+       if (args->in.flags)
+               return -EINVAL;
 
        switch (args->in.op) {
        case AMDGPU_VM_OP_RESERVE_VMID:
                /* We only have requirement to reserve vmid from gfxhub */
-               r = amdgpu_vmid_alloc_reserved(adev, &fpriv->vm,
-                                              AMDGPU_GFXHUB_0);
-               if (r)
-                       return r;
+               if (!fpriv->vm.reserved_vmid[AMDGPU_GFXHUB(0)]) {
+                       amdgpu_vmid_alloc_reserved(adev, AMDGPU_GFXHUB(0));
+                       fpriv->vm.reserved_vmid[AMDGPU_GFXHUB(0)] = true;
+               }
+
                break;
        case AMDGPU_VM_OP_UNRESERVE_VMID:
-               amdgpu_vmid_free_reserved(adev, &fpriv->vm, AMDGPU_GFXHUB_0);
+               if (fpriv->vm.reserved_vmid[AMDGPU_GFXHUB(0)]) {
+                       amdgpu_vmid_free_reserved(adev, AMDGPU_GFXHUB(0));
+                       fpriv->vm.reserved_vmid[AMDGPU_GFXHUB(0)] = false;
+               }
                break;
        default:
                return -EINVAL;
@@ -2432,6 +2505,9 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
  * amdgpu_vm_handle_fault - graceful handling of VM faults.
  * @adev: amdgpu device pointer
  * @pasid: PASID of the VM
+ * @vmid: VMID, only used for GFX 9.4.3.
+ * @node_id: Node_id received in IH cookie. Only applicable for
+ *           GFX 9.4.3.
  * @addr: Address of the fault
  * @write_fault: true is write fault, false is read fault
  *
@@ -2439,7 +2515,8 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
  * shouldn't be reported any more.
  */
 bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
-                           uint64_t addr, bool write_fault)
+                           u32 vmid, u32 node_id, uint64_t addr,
+                           bool write_fault)
 {
        bool is_compute_context = false;
        struct amdgpu_bo *root;
@@ -2463,8 +2540,8 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
 
        addr /= AMDGPU_GPU_PAGE_SIZE;
 
-       if (is_compute_context &&
-           !svm_range_restore_pages(adev, pasid, addr, write_fault)) {
+       if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid,
+           node_id, addr, write_fault)) {
                amdgpu_bo_unref(&root);
                return true;
        }