From 84586de496103453c0c8dbf5c233f10381644cf5 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Thu, 20 May 2021 13:30:31 +0300 Subject: [PATCH] habanalabs: reset device upon FD close if not idle If the device is not idle after the user closes the FD, we must reset the device, because otherwise the next user that tries to open the FD will encounter a non-functional device. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/context.c | 9 --------- drivers/misc/habanalabs/common/device.c | 20 +++++++++++++++++--- drivers/misc/habanalabs/common/habanalabs.h | 1 + drivers/misc/habanalabs/common/habanalabs_drv.c | 1 + 4 files changed, 19 insertions(+), 12 deletions(-) diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c index 62d7058..19b6b04 100644 --- a/drivers/misc/habanalabs/common/context.c +++ b/drivers/misc/habanalabs/common/context.c @@ -12,7 +12,6 @@ static void hl_ctx_fini(struct hl_ctx *ctx) { struct hl_device *hdev = ctx->hdev; - u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0}; int i; /* Release all allocated pending cb's, those cb's were never @@ -57,14 +56,6 @@ static void hl_ctx_fini(struct hl_ctx *ctx) /* Scrub both SRAM and DRAM */ hdev->asic_funcs->scrub_device_mem(hdev, 0, 0); - - if ((!hdev->pldm) && (hdev->pdev) && - (!hdev->asic_funcs->is_device_idle(hdev, - idle_mask, - HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL))) - dev_notice(hdev->dev, - "device not idle after user context is closed (0x%llx, 0x%llx)\n", - idle_mask[0], idle_mask[1]); } else { dev_dbg(hdev->dev, "closing kernel context\n"); hdev->asic_funcs->ctx_fini(ctx); diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index bc58a91..0056282 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -51,6 +51,8 @@ bool hl_device_operational(struct hl_device *hdev, static void hpriv_release(struct kref *ref) { + u64 
idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0}; + bool device_is_idle = true; struct hl_fpriv *hpriv; struct hl_device *hdev; @@ -71,7 +73,19 @@ static void hpriv_release(struct kref *ref) kfree(hpriv); - if (hdev->reset_upon_device_release) + if ((!hdev->pldm) && (hdev->pdev) && + (!hdev->asic_funcs->is_device_idle(hdev, + idle_mask, + HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL))) { + dev_err(hdev->dev, + "device not idle after user context is closed (0x%llx_%llx)\n", + idle_mask[1], idle_mask[0]); + + device_is_idle = false; + } + + if ((hdev->reset_if_device_not_idle && !device_is_idle) + || hdev->reset_upon_device_release) hl_device_reset(hdev, 0); } @@ -1108,8 +1122,8 @@ kill_processes: if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask, HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) { dev_err(hdev->dev, - "device is not idle (mask %#llx %#llx) after reset\n", - idle_mask[0], idle_mask[1]); + "device is not idle (mask 0x%llx_%llx) after reset\n", + idle_mask[1], idle_mask[0]); rc = -EIO; goto out_err; } diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 56d2f41..bcb5bfd 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2311,6 +2311,7 @@ struct hl_device { u8 rl_enable; u8 reset_on_preboot_fail; u8 reset_upon_device_release; + u8 reset_if_device_not_idle; }; diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index 137e7dc..b55dd1c 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -264,6 +264,7 @@ static void set_driver_behavior_per_device(struct hl_device *hdev) hdev->bmc_enable = 1; hdev->hard_reset_on_fw_events = 1; hdev->reset_on_preboot_fail = 1; + hdev->reset_if_device_not_idle = 1; hdev->reset_pcilink = 0; hdev->axi_drain = 0; -- 2.7.4