habanalabs/gaudi: add FW alive event support
author Ofir Bitton <obitton@habana.ai>
Wed, 2 Jun 2021 08:56:31 +0000 (11:56 +0300)
committer Oded Gabbay <ogabbay@kernel.org>
Fri, 18 Jun 2021 12:23:41 +0000 (15:23 +0300)
In order for the driver to be aware of process or thread crashes inside
GAUDI's CPU, we introduce a new event which contains all the relevant
information. Upon receiving the event, the driver will dump the
information and reset the device.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/misc/habanalabs/gaudi/gaudi.c
drivers/misc/habanalabs/include/common/cpucp_if.h
drivers/misc/habanalabs/include/gaudi/gaudi_async_events.h
drivers/misc/habanalabs/include/gaudi/gaudi_async_ids_map_extended.h

index 476dbe6..953c5a5 100644 (file)
@@ -7451,6 +7451,16 @@ static void gaudi_print_out_of_sync_info(struct hl_device *hdev,
                        sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci));
 }
 
+/**
+ * gaudi_print_fw_alive_info() - log the payload of a FW alive event.
+ * @hdev: habanalabs device whose log receives the report.
+ * @fw_alive: FW alive payload taken from the event queue entry.
+ *
+ * The multi-byte payload fields are declared little-endian (__le32/__le64),
+ * so they are converted to CPU byte order before being formatted. Any
+ * severity other than FW_ALIVE_SEVERITY_MINOR is reported as "Critical".
+ */
+static void gaudi_print_fw_alive_info(struct hl_device *hdev,
+                                       struct hl_eq_fw_alive *fw_alive)
+{
+       dev_err(hdev->dev,
+               "FW alive report: severity=%s, process_id=%u, thread_id=%u, uptime=%llu seconds\n",
+               (fw_alive->severity == FW_ALIVE_SEVERITY_MINOR) ?
+               "Minor" : "Critical",
+               le32_to_cpu(fw_alive->process_id),
+               le32_to_cpu(fw_alive->thread_id),
+               le64_to_cpu(fw_alive->uptime_seconds));
+}
+
 static int gaudi_soft_reset_late_init(struct hl_device *hdev)
 {
        struct gaudi_device *gaudi = hdev->asic_specific;
@@ -7902,6 +7912,11 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
                gaudi_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err);
                goto reset_device;
 
+       case GAUDI_EVENT_FW_ALIVE_S:
+               gaudi_print_irq_info(hdev, event_type, false);
+               gaudi_print_fw_alive_info(hdev, &eq_entry->fw_alive);
+               goto reset_device;
+
        default:
                dev_err(hdev->dev, "Received invalid H/W interrupt %d\n",
                                event_type);
index c7da622..d4dc189 100644 (file)
@@ -84,6 +84,20 @@ struct hl_eq_sm_sei_data {
        __u8 pad[3];
 };
 
+enum hl_fw_alive_severity { /* severity carried in hl_eq_fw_alive.severity */
+       FW_ALIVE_SEVERITY_MINOR, /* printed as "Minor" by the gaudi handler */
+       FW_ALIVE_SEVERITY_CRITICAL /* printed as "Critical" */
+};
+
+struct hl_eq_fw_alive { /* event queue payload of a FW alive event */
+       __le64 uptime_seconds; /* little-endian; logged as uptime in seconds */
+       __le32 process_id; /* little-endian; logged as process_id */
+       __le32 thread_id; /* little-endian; logged as thread_id */
+       /* enum hl_fw_alive_severity */
+       __u8 severity;
+       __u8 pad[7]; /* pads the struct to 24 bytes */
+};
+
 struct hl_eq_entry {
        struct hl_eq_header hdr;
        union {
@@ -91,6 +105,7 @@ struct hl_eq_entry {
                struct hl_eq_hbm_ecc_data hbm_ecc_data;
                struct hl_eq_sm_sei_data sm_sei_data;
                struct cpucp_pkt_sync_err pkt_sync_err;
+               struct hl_eq_fw_alive fw_alive;
                __le64 data[7];
        };
 };
index e8651ab..f66c759 100644 (file)
@@ -303,6 +303,7 @@ enum gaudi_async_event_id {
        GAUDI_EVENT_NIC3_QP1 = 619,
        GAUDI_EVENT_NIC4_QP0 = 620,
        GAUDI_EVENT_NIC4_QP1 = 621,
+       GAUDI_EVENT_FW_ALIVE_S = 645,
        GAUDI_EVENT_DEV_RESET_REQ = 646,
        GAUDI_EVENT_PKT_QUEUE_OUT_SYNC = 647,
        GAUDI_EVENT_FIX_POWER_ENV_S = 658,
index 3dc79c1..e87554a 100644 (file)
@@ -669,7 +669,7 @@ static struct gaudi_async_events_ids_map gaudi_irq_map_table[] = {
        { .fc_id = 642, .cpu_id = 491, .valid = 0, .name = "" },
        { .fc_id = 643, .cpu_id = 492, .valid = 0, .name = "" },
        { .fc_id = 644, .cpu_id = 493, .valid = 0, .name = "" },
-       { .fc_id = 645, .cpu_id = 494, .valid = 0, .name = "" },
+       { .fc_id = 645, .cpu_id = 494, .valid = 1, .name = "FW_ALIVE_S" },
        { .fc_id = 646, .cpu_id = 495, .valid = 1, .name = "DEV_RESET_REQ" },
        { .fc_id = 647, .cpu_id = 496, .valid = 1,
                .name = "PKT_QUEUE_OUT_SYNC" },