habanalabs: ignore EEPROM errors during boot
author Ofir Bitton <obitton@habana.ai>
Tue, 23 Aug 2022 13:23:56 +0000 (16:23 +0300)
committer Oded Gabbay <ogabbay@kernel.org>
Sun, 18 Sep 2022 10:29:53 +0000 (13:29 +0300)
EEPROM errors reported by firmware are basically warnings and
should not fail the boot process.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/misc/habanalabs/common/firmware_if.c
drivers/misc/habanalabs/include/common/hl_boot_if.h

index 12d0f18..4ede4bb 100644 (file)
@@ -573,6 +573,15 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val,
                dev_dbg(hdev->dev, "Device status0 %#x\n", sts_val);
 
        /* All warnings should go here in order not to reach the unknown error validation */
+       if (err_val & CPU_BOOT_ERR0_EEPROM_FAIL) {
+               dev_warn(hdev->dev,
+                       "Device boot warning - EEPROM failure detected, default settings applied\n");
+               /* This is a warning so we don't want it to disable the
+                * device
+                */
+               err_val &= ~CPU_BOOT_ERR0_EEPROM_FAIL;
+       }
+
        if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) {
                dev_warn(hdev->dev,
                        "Device boot warning - Skipped DRAM initialization\n");
index f2f6488..2e45be5 100644 (file)
@@ -34,6 +34,7 @@ enum cpu_boot_err {
        CPU_BOOT_ERR_BINNING_FAIL = 19,
        CPU_BOOT_ERR_TPM_FAIL = 20,
        CPU_BOOT_ERR_TMP_THRESH_INIT_FAIL = 21,
+       CPU_BOOT_ERR_EEPROM_FAIL = 22,
        CPU_BOOT_ERR_ENABLED = 31,
        CPU_BOOT_ERR_SCND_EN = 63,
        CPU_BOOT_ERR_LAST = 64 /* we have 2 registers of 32 bits */
@@ -115,6 +116,9 @@ enum cpu_boot_err {
  * CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL  Failed to set threshold for temperature
  *                                     sensor.
  *
+ * CPU_BOOT_ERR0_EEPROM_FAIL           Failed reading EEPROM data. Defaults
+ *                                     are used.
+ *
  * CPU_BOOT_ERR0_ENABLED               Error registers enabled.
  *                                     This is a main indication that the
  *                                     running FW populates the error
@@ -139,6 +143,7 @@ enum cpu_boot_err {
 #define CPU_BOOT_ERR0_BINNING_FAIL             (1 << CPU_BOOT_ERR_BINNING_FAIL)
 #define CPU_BOOT_ERR0_TPM_FAIL                 (1 << CPU_BOOT_ERR_TPM_FAIL)
 #define CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL     (1 << CPU_BOOT_ERR_TMP_THRESH_INIT_FAIL)
+#define CPU_BOOT_ERR0_EEPROM_FAIL              (1 << CPU_BOOT_ERR_EEPROM_FAIL)
 #define CPU_BOOT_ERR0_ENABLED                  (1 << CPU_BOOT_ERR_ENABLED)
 #define CPU_BOOT_ERR1_ENABLED                  (1 << CPU_BOOT_ERR_ENABLED)