habanalabs: report EQ fault during heartbeat
author Ohad Sharabi <osharabi@habana.ai>
Sun, 6 Jun 2021 19:38:23 +0000 (22:38 +0300)
committer Oded Gabbay <ogabbay@kernel.org>
Fri, 18 Jun 2021 12:23:41 +0000 (15:23 +0300)
In case of an EQ fault, we would like to know about it.
For this, a status bitmask was added, in which the EQ_FAULT bit is
set by the FW when an EQ fault occurs.

Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/misc/habanalabs/common/firmware_if.c
drivers/misc/habanalabs/include/common/cpucp_if.h

index 9412e67..d5a3c78 100644 (file)
@@ -362,7 +362,7 @@ void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
 
 int hl_fw_send_heartbeat(struct hl_device *hdev)
 {
-       struct cpucp_packet hb_pkt = {};
+       struct cpucp_packet hb_pkt = {0};
        u64 result;
        int rc;
 
@@ -374,7 +374,13 @@ int hl_fw_send_heartbeat(struct hl_device *hdev)
                                                sizeof(hb_pkt), 0, &result);
 
        if ((rc) || (result != CPUCP_PACKET_FENCE_VAL))
+               return -EIO;
+
+       if (le32_to_cpu(hb_pkt.status_mask) &
+                                       CPUCP_PKT_HB_STATUS_EQ_FAULT_MASK) {
+               dev_warn(hdev->dev, "FW reported EQ fault during heartbeat\n");
                rc = -EIO;
+       }
 
        return rc;
 }
index d4dc189..80b1d5a 100644 (file)
@@ -404,6 +404,20 @@ enum cpucp_packet_id {
 #define CPUCP_PKT_RES_PLL_OUT3_SHIFT   48
 #define CPUCP_PKT_RES_PLL_OUT3_MASK    0xFFFF000000000000ull
 
+#define CPUCP_PKT_VAL_PFC_IN1_SHIFT    0
+#define CPUCP_PKT_VAL_PFC_IN1_MASK     0x0000000000000001ull
+#define CPUCP_PKT_VAL_PFC_IN2_SHIFT    1
+#define CPUCP_PKT_VAL_PFC_IN2_MASK     0x000000000000001Eull
+
+#define CPUCP_PKT_VAL_LPBK_IN1_SHIFT   0
+#define CPUCP_PKT_VAL_LPBK_IN1_MASK    0x0000000000000001ull
+#define CPUCP_PKT_VAL_LPBK_IN2_SHIFT   1
+#define CPUCP_PKT_VAL_LPBK_IN2_MASK    0x000000000000001Eull
+
+/* heartbeat status bits */
+#define CPUCP_PKT_HB_STATUS_EQ_FAULT_SHIFT             0
+#define CPUCP_PKT_HB_STATUS_EQ_FAULT_MASK              0x00000001
+
 struct cpucp_packet {
        union {
                __le64 value;   /* For SET packets */
@@ -445,6 +459,12 @@ struct cpucp_packet {
 
                /* For get CpuCP info/EEPROM data/NIC info */
                __le32 data_max_size;
+
+               /*
+                * For any general status bitmask. Shall be used whenever the
+                * result cannot be used to hold general purpose data.
+                */
+               __le32 status_mask;
        };
 
        __le32 reserved;