habanalabs: reset device upon fw read failure
author farah kassabri <fkassabri@habana.ai>
Wed, 14 Oct 2020 12:17:36 +0000 (15:17 +0300)
committer Oded Gabbay <ogabbay@kernel.org>
Mon, 30 Nov 2020 08:47:32 +0000 (10:47 +0200)
Failure in reading the pre-boot version is not handled correctly.
Upon failure we need to reset the device in order to be able
to reinstall the driver.

Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/misc/habanalabs/common/firmware_if.c
drivers/misc/habanalabs/common/habanalabs.h
drivers/misc/habanalabs/common/pci.c
drivers/misc/habanalabs/gaudi/gaudi.c
drivers/misc/habanalabs/goya/goya.c

index fb9d901..2fc12e5 100644 (file)
@@ -607,7 +607,9 @@ int hl_fw_read_preboot_status(struct hl_device *hdev, u32 cpu_boot_status_reg,
                return -EIO;
        }
 
-       hdev->asic_funcs->read_device_fw_version(hdev, FW_COMP_PREBOOT);
+       rc = hdev->asic_funcs->read_device_fw_version(hdev, FW_COMP_PREBOOT);
+       if (rc)
+               return rc;
 
        security_status = RREG32(cpu_security_boot_status_reg);
 
index f00891d..0f4f8ef 100644 (file)
@@ -927,7 +927,7 @@ struct hl_asic_funcs {
        void (*ctx_fini)(struct hl_ctx *ctx);
        int (*get_clk_rate)(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk);
        u32 (*get_queue_id_for_cq)(struct hl_device *hdev, u32 cq_idx);
-       void (*read_device_fw_version)(struct hl_device *hdev,
+       int (*read_device_fw_version)(struct hl_device *hdev,
                                        enum hl_fw_component fwc);
        int (*load_firmware_to_device)(struct hl_device *hdev);
        int (*load_boot_fit_to_device)(struct hl_device *hdev);
index 211f319..02152d8 100644 (file)
@@ -390,8 +390,11 @@ int hl_pci_init(struct hl_device *hdev, u32 cpu_boot_status_reg,
        rc = hl_fw_read_preboot_status(hdev, cpu_boot_status_reg,
                        cpu_security_boot_status_reg, boot_err0_reg,
                        preboot_ver_timeout);
-       if (rc)
+       if (rc) {
+               dev_err(hdev->dev, "Failed to read preboot version\n");
+               hdev->asic_funcs->hw_fini(hdev, true);
                goto unmap_pci_bars;
+       }
 
        return 0;
 
index ecfcfdf..6aa3e38 100644 (file)
@@ -3603,7 +3603,7 @@ static int gaudi_load_boot_fit_to_device(struct hl_device *hdev)
        return hl_fw_load_fw_to_device(hdev, GAUDI_BOOT_FIT_FILE, dst, 0, 0);
 }
 
-static void gaudi_read_device_fw_version(struct hl_device *hdev,
+static int gaudi_read_device_fw_version(struct hl_device *hdev,
                                        enum hl_fw_component fwc)
 {
        const char *name;
@@ -3623,7 +3623,7 @@ static void gaudi_read_device_fw_version(struct hl_device *hdev,
                break;
        default:
                dev_warn(hdev->dev, "Undefined FW component: %d\n", fwc);
-               return;
+               return -EIO;
        }
 
        ver_off &= ~((u32)SRAM_BASE_ADDR);
@@ -3635,7 +3635,10 @@ static void gaudi_read_device_fw_version(struct hl_device *hdev,
                dev_err(hdev->dev, "%s version offset (0x%x) is above SRAM\n",
                                                                name, ver_off);
                strcpy(dest, "unavailable");
+               return -EIO;
        }
+
+       return 0;
 }
 
 static int gaudi_init_cpu(struct hl_device *hdev)
@@ -3925,16 +3928,18 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
 
        WREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS, boot_strap);
 
-       gaudi->hw_cap_initialized &= ~(HW_CAP_CPU | HW_CAP_CPU_Q |
-                                       HW_CAP_HBM | HW_CAP_PCI_DMA |
-                                       HW_CAP_MME | HW_CAP_TPC_MASK |
-                                       HW_CAP_HBM_DMA | HW_CAP_PLL |
-                                       HW_CAP_NIC_MASK | HW_CAP_MMU |
-                                       HW_CAP_SRAM_SCRAMBLER |
-                                       HW_CAP_HBM_SCRAMBLER |
-                                       HW_CAP_CLK_GATE);
+       if (gaudi) {
+               gaudi->hw_cap_initialized &= ~(HW_CAP_CPU | HW_CAP_CPU_Q |
+                               HW_CAP_HBM | HW_CAP_PCI_DMA |
+                               HW_CAP_MME | HW_CAP_TPC_MASK |
+                               HW_CAP_HBM_DMA | HW_CAP_PLL |
+                               HW_CAP_NIC_MASK | HW_CAP_MMU |
+                               HW_CAP_SRAM_SCRAMBLER |
+                               HW_CAP_HBM_SCRAMBLER |
+                               HW_CAP_CLK_GATE);
 
-       memset(gaudi->events_stat, 0, sizeof(gaudi->events_stat));
+               memset(gaudi->events_stat, 0, sizeof(gaudi->events_stat));
+       }
 }
 
 static int gaudi_suspend(struct hl_device *hdev)
index ab46794..a0580b8 100644 (file)
@@ -2341,7 +2341,7 @@ static int goya_load_boot_fit_to_device(struct hl_device *hdev)
  * FW component passes an offset from SRAM_BASE_ADDR in SCRATCHPAD_xx.
  * The version string should be located by that offset.
  */
-static void goya_read_device_fw_version(struct hl_device *hdev,
+static int goya_read_device_fw_version(struct hl_device *hdev,
                                        enum hl_fw_component fwc)
 {
        const char *name;
@@ -2361,7 +2361,7 @@ static void goya_read_device_fw_version(struct hl_device *hdev,
                break;
        default:
                dev_warn(hdev->dev, "Undefined FW component: %d\n", fwc);
-               return;
+               return -EIO;
        }
 
        ver_off &= ~((u32)SRAM_BASE_ADDR);
@@ -2373,7 +2373,11 @@ static void goya_read_device_fw_version(struct hl_device *hdev,
                dev_err(hdev->dev, "%s version offset (0x%x) is above SRAM\n",
                                                                name, ver_off);
                strcpy(dest, "unavailable");
+
+               return -EIO;
        }
+
+       return 0;
 }
 
 static int goya_init_cpu(struct hl_device *hdev)
@@ -2644,12 +2648,14 @@ static void goya_hw_fini(struct hl_device *hdev, bool hard_reset)
        WREG32(mmPSOC_GLOBAL_CONF_SW_BTM_FSM,
                        0xA << PSOC_GLOBAL_CONF_SW_BTM_FSM_CTRL_SHIFT);
 
-       goya->hw_cap_initialized &= ~(HW_CAP_CPU | HW_CAP_CPU_Q |
-                                       HW_CAP_DDR_0 | HW_CAP_DDR_1 |
-                                       HW_CAP_DMA | HW_CAP_MME |
-                                       HW_CAP_MMU | HW_CAP_TPC_MBIST |
-                                       HW_CAP_GOLDEN | HW_CAP_TPC);
-       memset(goya->events_stat, 0, sizeof(goya->events_stat));
+       if (goya) {
+               goya->hw_cap_initialized &= ~(HW_CAP_CPU | HW_CAP_CPU_Q |
+                               HW_CAP_DDR_0 | HW_CAP_DDR_1 |
+                               HW_CAP_DMA | HW_CAP_MME |
+                               HW_CAP_MMU | HW_CAP_TPC_MBIST |
+                               HW_CAP_GOLDEN | HW_CAP_TPC);
+               memset(goya->events_stat, 0, sizeof(goya->events_stat));
+       }
 }
 
 int goya_suspend(struct hl_device *hdev)