habanalabs: enable stop-on-error debugfs setting per ASIC
author Tomer Tayar <ttayar@habana.ai>
Wed, 12 Jan 2022 18:08:01 +0000 (20:08 +0200)
committer Oded Gabbay <ogabbay@kernel.org>
Mon, 28 Feb 2022 12:22:05 +0000 (14:22 +0200)
On Goya and Gaudi, the stop-on-error configuration can be set via
debugfs. However, in future devices, this configuration will always be
enabled.
Modify the debugfs node to be allowed only for ASICs that support this
dynamic configuration.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Documentation/ABI/testing/debugfs-driver-habanalabs
drivers/misc/habanalabs/common/debugfs.c
drivers/misc/habanalabs/common/habanalabs.h
drivers/misc/habanalabs/gaudi/gaudi.c
drivers/misc/habanalabs/goya/goya.c

index 783001a..bcf6915 100644 (file)
@@ -222,6 +222,7 @@ KernelVersion:  5.6
 Contact:        ogabbay@kernel.org
 Description:    Sets the stop-on_error option for the device engines. Value of
                 "0" is for disable, otherwise enable.
+                Relevant only for GOYA and GAUDI.
 
 What:           /sys/kernel/debug/habanalabs/hl<n>/timeout_locked
 Date:           Sep 2021
index e3ee5f4..9f0aaf0 100644 (file)
@@ -1071,6 +1071,9 @@ static ssize_t hl_stop_on_err_read(struct file *f, char __user *buf,
        char tmp_buf[200];
        ssize_t rc;
 
+       if (!hdev->asic_prop.configurable_stop_on_err)
+               return -EOPNOTSUPP;
+
        if (*ppos)
                return 0;
 
@@ -1089,6 +1092,9 @@ static ssize_t hl_stop_on_err_write(struct file *f, const char __user *buf,
        u32 value;
        ssize_t rc;
 
+       if (!hdev->asic_prop.configurable_stop_on_err)
+               return -EOPNOTSUPP;
+
        if (hdev->reset_info.in_reset) {
                dev_warn_ratelimited(hdev->dev,
                                "Can't change stop on error during reset\n");
index b06e2b0..93116fe 100644 (file)
@@ -561,6 +561,7 @@ struct hl_hints_range {
  *                              use-case of doing soft-reset in training (due
  *                              to the fact that training runs on multiple
  *                              devices)
+ * @configurable_stop_on_err: is stop-on-error option configurable via debugfs.
  */
 struct asic_fixed_properties {
        struct hw_queue_properties      *hw_queues_props;
@@ -644,6 +645,7 @@ struct asic_fixed_properties {
        u8                              use_get_power_for_reset_history;
        u8                              supports_soft_reset;
        u8                              allow_inference_soft_reset;
+       u8                              configurable_stop_on_err;
 };
 
 /**
index f2242aa..61aa6dc 100644 (file)
@@ -669,6 +669,8 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
 
        prop->use_get_power_for_reset_history = true;
 
+       prop->configurable_stop_on_err = true;
+
        return 0;
 }
 
index 3785fb3..c8143b6 100644 (file)
@@ -483,6 +483,8 @@ int goya_set_fixed_properties(struct hl_device *hdev)
 
        prop->use_get_power_for_reset_history = true;
 
+       prop->configurable_stop_on_err = true;
+
        return 0;
 }