habanalabs/gaudi: fetch HBM ecc info from FW
authorOfir Bitton <obitton@habana.ai>
Mon, 5 Oct 2020 10:44:59 +0000 (13:44 +0300)
committerOded Gabbay <ogabbay@kernel.org>
Mon, 30 Nov 2020 08:47:34 +0000 (10:47 +0200)
Once FW security is enabled there is no access to HBM ecc registers,
need to read values from FW using a dedicated interface.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/misc/habanalabs/gaudi/gaudi.c
drivers/misc/habanalabs/include/common/cpucp_if.h

index 856cb1b..f93809f 100644 (file)
@@ -6839,10 +6839,41 @@ static int gaudi_soft_reset_late_init(struct hl_device *hdev)
        return hl_fw_unmask_irq_arr(hdev, gaudi->events, sizeof(gaudi->events));
 }
 
-static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device)
+static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device,
+                       struct hl_eq_hbm_ecc_data *hbm_ecc_data)
 {
-       int ch, err = 0;
-       u32 base, val, val2;
+       u32 base, val, val2, wr_par, rd_par, ca_par, derr, serr, type, ch;
+       int err = 0;
+
+       if (!hdev->asic_prop.fw_security_disabled) {
+               if (!hbm_ecc_data) {
+                       dev_err(hdev->dev, "No FW ECC data");
+                       return 0;
+               }
+
+               wr_par = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_WR_PAR_MASK,
+                               le32_to_cpu(hbm_ecc_data->hbm_ecc_info));
+               rd_par = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_RD_PAR_MASK,
+                               le32_to_cpu(hbm_ecc_data->hbm_ecc_info));
+               ca_par = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_CA_PAR_MASK,
+                               le32_to_cpu(hbm_ecc_data->hbm_ecc_info));
+               derr = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_DERR_MASK,
+                               le32_to_cpu(hbm_ecc_data->hbm_ecc_info));
+               serr = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_SERR_MASK,
+                               le32_to_cpu(hbm_ecc_data->hbm_ecc_info));
+               type = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_TYPE_MASK,
+                               le32_to_cpu(hbm_ecc_data->hbm_ecc_info));
+               ch = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_HBM_CH_MASK,
+                               le32_to_cpu(hbm_ecc_data->hbm_ecc_info));
+
+               dev_err(hdev->dev,
+                       "HBM%d pc%d interrupts info: WR_PAR=%d, RD_PAR=%d, CA_PAR=%d, SERR=%d, DERR=%d\n",
+                       device, ch, wr_par, rd_par, ca_par, serr, derr);
+
+               err = 1;
+
+               return 0;
+       }
 
        base = GAUDI_HBM_CFG_BASE + device * GAUDI_HBM_CFG_OFFSET;
        for (ch = 0 ; ch < GAUDI_HBM_CHANNELS ; ch++) {
@@ -6858,7 +6889,7 @@ static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device)
 
                        val2 = RREG32(base + ch * 0x1000 + 0x060);
                        dev_err(hdev->dev,
-                               "HBM%d pc%d ECC info: 1ST_ERR_ADDR=0x%x, 1ST_ERR_TYPE=%d, SEC_CONT_CNT=%d, SEC_CNT=%d, DED_CNT=%d\n",
+                               "HBM%d pc%d ECC info: 1ST_ERR_ADDR=0x%x, 1ST_ERR_TYPE=%d, SEC_CONT_CNT=%d, SEC_CNT=%d, DEC_CNT=%d\n",
                                device, ch * 2,
                                RREG32(base + ch * 0x1000 + 0x064),
                                (val2 & 0x200) >> 9, (val2 & 0xFC00) >> 10,
@@ -6878,7 +6909,7 @@ static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device)
 
                        val2 = RREG32(base + ch * 0x1000 + 0x070);
                        dev_err(hdev->dev,
-                               "HBM%d pc%d ECC info: 1ST_ERR_ADDR=0x%x, 1ST_ERR_TYPE=%d, SEC_CONT_CNT=%d, SEC_CNT=%d, DED_CNT=%d\n",
+                               "HBM%d pc%d ECC info: 1ST_ERR_ADDR=0x%x, 1ST_ERR_TYPE=%d, SEC_CONT_CNT=%d, SEC_CNT=%d, DEC_CNT=%d\n",
                                device, ch * 2 + 1,
                                RREG32(base + ch * 0x1000 + 0x074),
                                (val2 & 0x200) >> 9, (val2 & 0xFC00) >> 10,
@@ -7079,7 +7110,8 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
        case GAUDI_EVENT_HBM3_SPI_0:
                gaudi_print_irq_info(hdev, event_type, false);
                gaudi_hbm_read_interrupts(hdev,
-                                         gaudi_hbm_event_to_dev(event_type));
+                               gaudi_hbm_event_to_dev(event_type),
+                               &eq_entry->hbm_ecc_data);
                if (hdev->hard_reset_on_fw_events)
                        hl_device_reset(hdev, true, false);
                break;
@@ -7090,7 +7122,8 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
        case GAUDI_EVENT_HBM3_SPI_1:
                gaudi_print_irq_info(hdev, event_type, false);
                gaudi_hbm_read_interrupts(hdev,
-                                         gaudi_hbm_event_to_dev(event_type));
+                               gaudi_hbm_event_to_dev(event_type),
+                               &eq_entry->hbm_ecc_data);
                break;
 
        case GAUDI_EVENT_TPC0_DEC:
index 1c1e2b3..759c068 100644 (file)
 #include <linux/types.h>
 #include <linux/if_ether.h>
 
+#define NUM_HBM_PSEUDO_CH                              2
+#define NUM_HBM_CH_PER_DEV                             8
+#define CPUCP_PKT_HBM_ECC_INFO_WR_PAR_SHIFT            0
+#define CPUCP_PKT_HBM_ECC_INFO_WR_PAR_MASK             0x00000001
+#define CPUCP_PKT_HBM_ECC_INFO_RD_PAR_SHIFT            1
+#define CPUCP_PKT_HBM_ECC_INFO_RD_PAR_MASK             0x00000002
+#define CPUCP_PKT_HBM_ECC_INFO_CA_PAR_SHIFT            2
+#define CPUCP_PKT_HBM_ECC_INFO_CA_PAR_MASK             0x00000004
+#define CPUCP_PKT_HBM_ECC_INFO_DERR_SHIFT              3
+#define CPUCP_PKT_HBM_ECC_INFO_DERR_MASK               0x00000008
+#define CPUCP_PKT_HBM_ECC_INFO_SERR_SHIFT              4
+#define CPUCP_PKT_HBM_ECC_INFO_SERR_MASK               0x00000010
+#define CPUCP_PKT_HBM_ECC_INFO_TYPE_SHIFT              5
+#define CPUCP_PKT_HBM_ECC_INFO_TYPE_MASK               0x00000020
+#define CPUCP_PKT_HBM_ECC_INFO_HBM_CH_SHIFT            6
+#define CPUCP_PKT_HBM_ECC_INFO_HBM_CH_MASK             0x000007C0
+
+struct hl_eq_hbm_ecc_data {
+       /* SERR counter */
+       __le32 sec_cnt;
+       /* DERR counter */
+       __le32 dec_cnt;
+       /* Supplemental Information according to the mask bits */
+       __le32 hbm_ecc_info;
+       /* Address in hbm where the ecc happened */
+       __le32 first_addr;
+       /* SERR continuous address counter */
+       __le32 sec_cont_cnt;
+       __le32 pad;
+};
+
 /*
  * EVENT QUEUE
  */
@@ -31,6 +62,7 @@ struct hl_eq_entry {
        struct hl_eq_header hdr;
        union {
                struct hl_eq_ecc_data ecc_data;
+               struct hl_eq_hbm_ecc_data hbm_ecc_data;
                __le64 data[7];
        };
 };