From 4d041216c83dd9933c7c72b40511bb3585fa1724 Mon Sep 17 00:00:00 2001 From: Yuri Nudelman Date: Sun, 13 Jun 2021 09:22:20 +0300 Subject: [PATCH] debugfs: add skip_reset_on_timeout option To be able to debug long-running CS better, without changing the userspace code, we are adding a new option through debugfs interface to skip the reset of the device in case of CS timeout. Signed-off-by: Yuri Nudelman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- Documentation/ABI/testing/debugfs-driver-habanalabs | 8 ++++++++ drivers/misc/habanalabs/common/command_submission.c | 1 + drivers/misc/habanalabs/common/debugfs.c | 5 +++++ drivers/misc/habanalabs/common/habanalabs.h | 3 +++ 4 files changed, 17 insertions(+) diff --git a/Documentation/ABI/testing/debugfs-driver-habanalabs b/Documentation/ABI/testing/debugfs-driver-habanalabs index c78fc92..e78ceb1 100644 --- a/Documentation/ABI/testing/debugfs-driver-habanalabs +++ b/Documentation/ABI/testing/debugfs-driver-habanalabs @@ -207,6 +207,14 @@ Contact: ogabbay@kernel.org Description: Sets the PCI power state. Valid values are "1" for D0 and "2" for D3Hot +What: /sys/kernel/debug/habanalabs/hl/skip_reset_on_timeout +Date: Jun 2021 +KernelVersion: 5.13 +Contact: ynudelman@habana.ai +Description: Sets the skip reset on timeout option for the device. Value of + "0" means device will be reset in case some CS has timed out, + otherwise it will not be reset. + What: /sys/kernel/debug/habanalabs/hl/stop_on_err Date: Mar 2020 KernelVersion: 5.6 diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index 6d51f54..adedb28 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -663,6 +663,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, cs->timestamp = !!(flags & HL_CS_FLAGS_TIMESTAMP); cs->timeout_jiffies = timeout; cs->skip_reset_on_timeout = + hdev->skip_reset_on_timeout || !!(flags & HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT); cs->submission_time_jiffies = jiffies; INIT_LIST_HEAD(&cs->job_list); diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c index 8381155..703d79f 100644 --- a/drivers/misc/habanalabs/common/debugfs.c +++ b/drivers/misc/habanalabs/common/debugfs.c @@ -1278,6 +1278,11 @@ void hl_debugfs_add_device(struct hl_device *hdev) dev_entry->root, &dev_entry->blob_desc); + debugfs_create_x8("skip_reset_on_timeout", + 0644, + dev_entry->root, + &hdev->skip_reset_on_timeout); + for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) { debugfs_create_file(hl_debugfs_list[i].name, 0444, diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index b4413c3..09b89fd 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2191,6 +2191,8 @@ struct hl_mmu_funcs { * @supports_staged_submission: true if staged submissions are supported * @curr_reset_cause: saves an enumerated reset cause when a hard reset is * triggered, and cleared after it is shared with preboot. + * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to + * complete instead. */ struct hl_device { struct pci_dev *pdev; @@ -2305,6 +2307,7 @@ struct hl_device { u8 device_fini_pending; u8 supports_staged_submission; u8 curr_reset_cause; + u8 skip_reset_on_timeout; /* Parameters for bring-up */ u64 nic_ports_mask; -- 2.7.4