From 18d1358459bd2a18f4582c9ecbdb63cb8825fce0 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Wed, 25 Jan 2023 11:38:56 +0200 Subject: [PATCH] accel/habanalabs: enable graceful reset mechanism for compute-reset The graceful reset mechanism is currently enabled only for reset requests that will end up with hard-reset. In future, reset requests due to errors in some device engines, are going to be modified to request compute-reset, as the much longer hard-reset is not really needed there. To allow it, enable graceful reset also for compute-reset, and reset after user releases the device won't be escalated to hard-reset in those cases. If watchdog expires and user didn't release the device, hard-reset will be initiated in any case. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay Reviewed-by: Stanislaw Gruszka --- drivers/accel/habanalabs/common/device.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index d140eaef..2d496cd 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -778,14 +778,14 @@ static void device_hard_reset_pending(struct work_struct *work) static void device_release_watchdog_func(struct work_struct *work) { - struct hl_device_reset_work *device_release_watchdog_work = - container_of(work, struct hl_device_reset_work, reset_work.work); - struct hl_device *hdev = device_release_watchdog_work->hdev; + struct hl_device_reset_work *watchdog_work = + container_of(work, struct hl_device_reset_work, reset_work.work); + struct hl_device *hdev = watchdog_work->hdev; u32 flags; - dev_dbg(hdev->dev, "Device wasn't released in time. Initiate device reset.\n"); + dev_dbg(hdev->dev, "Device wasn't released in time. Initiate hard-reset.\n"); - flags = device_release_watchdog_work->flags | HL_DRV_RESET_FROM_WD_THR; + flags = watchdog_work->flags | HL_DRV_RESET_HARD | HL_DRV_RESET_FROM_WD_THR; hl_device_reset(hdev, flags); } @@ -1555,15 +1555,17 @@ do_reset: /* Cancel the device release watchdog work if required. * In case of reset-upon-device-release while the release watchdog work is - * scheduled, do hard-reset instead of compute-reset. + * scheduled due to a hard-reset, do hard-reset instead of compute-reset. */ if ((hard_reset || from_dev_release) && hdev->reset_info.watchdog_active) { + struct hl_device_reset_work *watchdog_work = + &hdev->device_release_watchdog_work; + hdev->reset_info.watchdog_active = 0; if (!from_watchdog_thread) - cancel_delayed_work_sync( - &hdev->device_release_watchdog_work.reset_work); + cancel_delayed_work_sync(&watchdog_work->reset_work); - if (from_dev_release) { + if (from_dev_release && (watchdog_work->flags & HL_DRV_RESET_HARD)) { hdev->reset_info.in_compute_reset = 0; flags |= HL_DRV_RESET_HARD; flags &= ~HL_DRV_RESET_DEV_RELEASE; @@ -1890,10 +1892,6 @@ int hl_device_cond_reset(struct hl_device *hdev, u32 flags, u64 event_mask) { struct hl_ctx *ctx = NULL; - /* Device release watchdog is only for hard reset */ - if (!(flags & HL_DRV_RESET_HARD) && hdev->asic_prop.allow_inference_soft_reset) - goto device_reset; - /* F/W reset cannot be postponed */ if (flags & HL_DRV_RESET_BYPASS_REQ_TO_FW) goto device_reset; @@ -1921,7 +1919,7 @@ int hl_device_cond_reset(struct hl_device *hdev, u32 flags, u64 event_mask) goto out; hdev->device_release_watchdog_work.flags = flags; - dev_dbg(hdev->dev, "Device is going to be reset in %u sec unless being released\n", + dev_dbg(hdev->dev, "Device is going to be hard-reset in %u sec unless being released\n", hdev->device_release_watchdog_timeout_sec); schedule_delayed_work(&hdev->device_release_watchdog_work.reset_work, msecs_to_jiffies(hdev->device_release_watchdog_timeout_sec * 1000)); -- 2.7.4