habanalabs: extend process wait timeout in device fini
author Oded Gabbay <ogabbay@kernel.org>
Thu, 10 Nov 2022 15:24:02 +0000 (17:24 +0200)
committer Oded Gabbay <ogabbay@kernel.org>
Wed, 23 Nov 2022 14:13:48 +0000 (16:13 +0200)
Processes that use our device are likely to use other devices, such as
remote storage, at the same time.

In case our device is removed and a user process is still using the
device, we need to kill the user process. However, if that process
has a thread waiting for i/o to complete on remote storage, for example,
the process won't terminate.

Let's give it enough time (10 minutes) to terminate before giving up.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Reviewed-by: Tomer Tayar <ttayar@habana.ai>
drivers/misc/habanalabs/common/device.c
drivers/misc/habanalabs/common/habanalabs.h

index 0650e51..63d0cb7 100644 (file)
@@ -2300,14 +2300,16 @@ void hl_device_fini(struct hl_device *hdev)
         */
        dev_info(hdev->dev,
                "Waiting for all processes to exit (timeout of %u seconds)",
-               HL_PENDING_RESET_LONG_SEC);
+               HL_WAIT_PROCESS_KILL_ON_DEVICE_FINI);
 
-       rc = device_kill_open_processes(hdev, HL_PENDING_RESET_LONG_SEC, false);
+       hdev->process_kill_trial_cnt = 0;
+       rc = device_kill_open_processes(hdev, HL_WAIT_PROCESS_KILL_ON_DEVICE_FINI, false);
        if (rc) {
                dev_crit(hdev->dev, "Failed to kill all open processes\n");
                device_disable_open_processes(hdev, false);
        }
 
+       hdev->process_kill_trial_cnt = 0;
        rc = device_kill_open_processes(hdev, 0, true);
        if (rc) {
                dev_crit(hdev->dev, "Failed to kill all control device open processes\n");
index 0781b86..e7f8986 100644 (file)
@@ -50,9 +50,14 @@ struct hl_fpriv;
 #define HL_MMAP_OFFSET_VALUE_MASK      (0x1FFFFFFFFFFFull >> PAGE_SHIFT)
 #define HL_MMAP_OFFSET_VALUE_GET(off)  (off & HL_MMAP_OFFSET_VALUE_MASK)
 
-#define HL_PENDING_RESET_PER_SEC       10
-#define HL_PENDING_RESET_MAX_TRIALS    60 /* 10 minutes */
-#define HL_PENDING_RESET_LONG_SEC      60
+#define HL_PENDING_RESET_PER_SEC               10
+#define HL_PENDING_RESET_MAX_TRIALS            60 /* 10 minutes */
+#define HL_PENDING_RESET_LONG_SEC              60
+/*
+ * In device fini, wait 10 minutes for user processes to terminate after we kill them.
+ * This is needed to avoid clearing resources while user processes are still alive.
+ */
+#define HL_WAIT_PROCESS_KILL_ON_DEVICE_FINI    600
 
 #define HL_HARD_RESET_MAX_TIMEOUT      120
 #define HL_PLDM_HARD_RESET_MAX_TIMEOUT (HL_HARD_RESET_MAX_TIMEOUT * 3)