habanalabs: reset device upon FD close if not idle
author Ofir Bitton <obitton@habana.ai>
Thu, 20 May 2021 10:30:31 +0000 (13:30 +0300)
committer Oded Gabbay <ogabbay@kernel.org>
Fri, 18 Jun 2021 12:23:41 +0000 (15:23 +0300)
If the device is not idle after the user closes the FD, we must reset
the device, because the next user that tries to open the FD would
otherwise encounter a non-functional device.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/misc/habanalabs/common/context.c
drivers/misc/habanalabs/common/device.c
drivers/misc/habanalabs/common/habanalabs.h
drivers/misc/habanalabs/common/habanalabs_drv.c

index 62d7058..19b6b04 100644 (file)
@@ -12,7 +12,6 @@
 static void hl_ctx_fini(struct hl_ctx *ctx)
 {
        struct hl_device *hdev = ctx->hdev;
-       u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
        int i;
 
        /* Release all allocated pending cb's, those cb's were never
@@ -57,14 +56,6 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
 
                /* Scrub both SRAM and DRAM */
                hdev->asic_funcs->scrub_device_mem(hdev, 0, 0);
-
-               if ((!hdev->pldm) && (hdev->pdev) &&
-                               (!hdev->asic_funcs->is_device_idle(hdev,
-                                       idle_mask,
-                                       HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)))
-                       dev_notice(hdev->dev,
-                                       "device not idle after user context is closed (0x%llx, 0x%llx)\n",
-                                               idle_mask[0], idle_mask[1]);
        } else {
                dev_dbg(hdev->dev, "closing kernel context\n");
                hdev->asic_funcs->ctx_fini(ctx);
index bc58a91..0056282 100644 (file)
@@ -51,6 +51,8 @@ bool hl_device_operational(struct hl_device *hdev,
 
 static void hpriv_release(struct kref *ref)
 {
+       u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
+       bool device_is_idle = true;
        struct hl_fpriv *hpriv;
        struct hl_device *hdev;
 
@@ -71,7 +73,19 @@ static void hpriv_release(struct kref *ref)
 
        kfree(hpriv);
 
-       if (hdev->reset_upon_device_release)
+       if ((!hdev->pldm) && (hdev->pdev) &&
+                       (!hdev->asic_funcs->is_device_idle(hdev,
+                               idle_mask,
+                               HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL))) {
+               dev_err(hdev->dev,
+                       "device not idle after user context is closed (0x%llx_%llx)\n",
+                       idle_mask[1], idle_mask[0]);
+
+               device_is_idle = false;
+       }
+
+       if ((hdev->reset_if_device_not_idle && !device_is_idle)
+                       || hdev->reset_upon_device_release)
                hl_device_reset(hdev, 0);
 }
 
@@ -1108,8 +1122,8 @@ kill_processes:
        if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
                        HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
                dev_err(hdev->dev,
-                       "device is not idle (mask %#llx %#llx) after reset\n",
-                       idle_mask[0], idle_mask[1]);
+                       "device is not idle (mask 0x%llx_%llx) after reset\n",
+                       idle_mask[1], idle_mask[0]);
                rc = -EIO;
                goto out_err;
        }
index 56d2f41..bcb5bfd 100644 (file)
@@ -2311,6 +2311,7 @@ struct hl_device {
        u8                              rl_enable;
        u8                              reset_on_preboot_fail;
        u8                              reset_upon_device_release;
+       u8                              reset_if_device_not_idle;
 };
 
 
index 137e7dc..b55dd1c 100644 (file)
@@ -264,6 +264,7 @@ static void set_driver_behavior_per_device(struct hl_device *hdev)
        hdev->bmc_enable = 1;
        hdev->hard_reset_on_fw_events = 1;
        hdev->reset_on_preboot_fail = 1;
+       hdev->reset_if_device_not_idle = 1;
 
        hdev->reset_pcilink = 0;
        hdev->axi_drain = 0;