debugfs: add skip_reset_on_timeout option
author Yuri Nudelman <ynudelman@habana.ai>
Sun, 13 Jun 2021 06:22:20 +0000 (09:22 +0300)
committer Oded Gabbay <ogabbay@kernel.org>
Fri, 18 Jun 2021 12:23:43 +0000 (15:23 +0300)
To make it easier to debug long-running command submissions (CS) without
changing the userspace code, add a new option through the debugfs
interface that skips the device reset when a CS times out.

Signed-off-by: Yuri Nudelman <ynudelman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Documentation/ABI/testing/debugfs-driver-habanalabs
drivers/misc/habanalabs/common/command_submission.c
drivers/misc/habanalabs/common/debugfs.c
drivers/misc/habanalabs/common/habanalabs.h

index c78fc92..e78ceb1 100644 (file)
@@ -207,6 +207,14 @@ Contact:        ogabbay@kernel.org
 Description:    Sets the PCI power state. Valid values are "1" for D0 and "2"
                 for D3Hot
 
+What:           /sys/kernel/debug/habanalabs/hl<n>/skip_reset_on_timeout
+Date:           Jun 2021
+KernelVersion:  5.13
+Contact:        ynudelman@habana.ai
+Description:    Sets the skip-reset-on-timeout option for the device. A value
+                of "0" means the device is reset when a CS times out; any
+                other value means the device is not reset.
+
 What:           /sys/kernel/debug/habanalabs/hl<n>/stop_on_err
 Date:           Mar 2020
 KernelVersion:  5.6
index 6d51f54..adedb28 100644 (file)
@@ -663,6 +663,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
        cs->timestamp = !!(flags & HL_CS_FLAGS_TIMESTAMP);
        cs->timeout_jiffies = timeout;
        cs->skip_reset_on_timeout =
+               hdev->skip_reset_on_timeout ||
                !!(flags & HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT);
        cs->submission_time_jiffies = jiffies;
        INIT_LIST_HEAD(&cs->job_list);
index 8381155..703d79f 100644 (file)
@@ -1278,6 +1278,11 @@ void hl_debugfs_add_device(struct hl_device *hdev)
                                dev_entry->root,
                                &dev_entry->blob_desc);
 
+       debugfs_create_x8("skip_reset_on_timeout",
+                               0644,
+                               dev_entry->root,
+                               &hdev->skip_reset_on_timeout);
+
        for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
                debugfs_create_file(hl_debugfs_list[i].name,
                                        0444,
index b4413c3..09b89fd 100644 (file)
@@ -2191,6 +2191,8 @@ struct hl_mmu_funcs {
  * @supports_staged_submission: true if staged submissions are supported
  * @curr_reset_cause: saves an enumerated reset cause when a hard reset is
  *                    triggered, and cleared after it is shared with preboot.
+ * @skip_reset_on_timeout: Skip the device reset when a CS times out and wait
+ *                         for the CS to complete instead.
  */
 struct hl_device {
        struct pci_dev                  *pdev;
@@ -2305,6 +2307,7 @@ struct hl_device {
        u8                              device_fini_pending;
        u8                              supports_staged_submission;
        u8                              curr_reset_cause;
+       u8                              skip_reset_on_timeout;
 
        /* Parameters for bring-up */
        u64                             nic_ports_mask;