return rc;
}
-static void hl_device_hard_reset_pending(struct work_struct *work)
+static void device_kill_open_processes(struct hl_device *hdev)
{
- struct hl_device_reset_work *device_reset_work =
- container_of(work, struct hl_device_reset_work, reset_work);
- struct hl_device *hdev = device_reset_work->hdev;
u16 pending_total, pending_cnt;
struct task_struct *task = NULL;
}
}
+ /* We have killed the open user processes, but because the driver only
+ * cleans up after the user contexts are closed (e.g. MMU mappings), we
+ * need to wait again to make sure the cleaning phase is finished before
+ * continuing with the reset.
+ */
+
pending_cnt = pending_total;
while ((atomic_read(&hdev->fd_open_cnt)) && (pending_cnt)) {
mutex_unlock(&hdev->fd_open_cnt_lock);
+}
+
+static void device_hard_reset_pending(struct work_struct *work)
+{
+ struct hl_device_reset_work *device_reset_work =
+ container_of(work, struct hl_device_reset_work, reset_work);
+ struct hl_device *hdev = device_reset_work->hdev;
+
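+ /* Terminate any user processes that still hold the device open, so the
+ * reset can proceed with no active users
+ */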
+ device_kill_open_processes(hdev);
+
hl_device_reset(hdev, true, true);
kfree(device_reset_work);
* from a dedicated work
*/
INIT_WORK(&device_reset_work->reset_work,
- hl_device_hard_reset_pending);
+ device_hard_reset_pending);
device_reset_work->hdev = hdev;
schedule_work(&device_reset_work->reset_work);
/* Mark device as disabled */
hdev->disabled = true;
+ /*
+ * Flush anyone that is still inside the critical section of
+ * enqueueing jobs to the H/W
+ */
+ hdev->asic_funcs->hw_queues_lock(hdev);
+ hdev->asic_funcs->hw_queues_unlock(hdev);
+
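+ /* Make sure no user process is left holding the device open before
+ * continuing with the teardown
+ */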
+ device_kill_open_processes(hdev);
+
hl_hwmon_fini(hdev);
device_late_fini(hdev);