From: Ofir Bitton Date: Mon, 5 Oct 2020 10:44:59 +0000 (+0300) Subject: habanalabs/gaudi: fetch HBM ecc info from FW X-Git-Tag: accepted/tizen/unified/20230118.172025~8316^2~60^2~24 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=5a2998f46c8555815b1bf367daf0bf42b0440300;p=platform%2Fkernel%2Flinux-rpi.git habanalabs/gaudi: fetch HBM ecc info from FW Once FW security is enabled there is no access to HBM ecc registers, need to read values from FW using a dedicated interface. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 856cb1b..f93809f 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -6839,10 +6839,41 @@ static int gaudi_soft_reset_late_init(struct hl_device *hdev) return hl_fw_unmask_irq_arr(hdev, gaudi->events, sizeof(gaudi->events)); } -static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device) +static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device, + struct hl_eq_hbm_ecc_data *hbm_ecc_data) { - int ch, err = 0; - u32 base, val, val2; + u32 base, val, val2, wr_par, rd_par, ca_par, derr, serr, type, ch; + int err = 0; + + if (!hdev->asic_prop.fw_security_disabled) { + if (!hbm_ecc_data) { + dev_err(hdev->dev, "No FW ECC data"); + return 0; + } + + wr_par = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_WR_PAR_MASK, + le32_to_cpu(hbm_ecc_data->hbm_ecc_info)); + rd_par = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_RD_PAR_MASK, + le32_to_cpu(hbm_ecc_data->hbm_ecc_info)); + ca_par = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_CA_PAR_MASK, + le32_to_cpu(hbm_ecc_data->hbm_ecc_info)); + derr = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_DERR_MASK, + le32_to_cpu(hbm_ecc_data->hbm_ecc_info)); + serr = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_SERR_MASK, + le32_to_cpu(hbm_ecc_data->hbm_ecc_info)); + type = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_TYPE_MASK, + le32_to_cpu(hbm_ecc_data->hbm_ecc_info)); + ch = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_HBM_CH_MASK, + le32_to_cpu(hbm_ecc_data->hbm_ecc_info)); + + dev_err(hdev->dev, + "HBM%d pc%d interrupts info: WR_PAR=%d, RD_PAR=%d, CA_PAR=%d, SERR=%d, DERR=%d\n", + device, ch, wr_par, rd_par, ca_par, serr, derr); + + err = 1; + + return 0; + } base = GAUDI_HBM_CFG_BASE + device * GAUDI_HBM_CFG_OFFSET; for (ch = 0 ; ch < GAUDI_HBM_CHANNELS ; ch++) { @@ -6858,7 +6889,7 @@ static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device) val2 = RREG32(base + ch * 0x1000 + 0x060); dev_err(hdev->dev, - "HBM%d pc%d ECC info: 1ST_ERR_ADDR=0x%x, 1ST_ERR_TYPE=%d, SEC_CONT_CNT=%d, SEC_CNT=%d, DED_CNT=%d\n", + "HBM%d pc%d ECC info: 1ST_ERR_ADDR=0x%x, 1ST_ERR_TYPE=%d, SEC_CONT_CNT=%d, SEC_CNT=%d, DEC_CNT=%d\n", device, ch * 2, RREG32(base + ch * 0x1000 + 0x064), (val2 & 0x200) >> 9, (val2 & 0xFC00) >> 10, @@ -6878,7 +6909,7 @@ static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device) val2 = RREG32(base + ch * 0x1000 + 0x070); dev_err(hdev->dev, - "HBM%d pc%d ECC info: 1ST_ERR_ADDR=0x%x, 1ST_ERR_TYPE=%d, SEC_CONT_CNT=%d, SEC_CNT=%d, DED_CNT=%d\n", + "HBM%d pc%d ECC info: 1ST_ERR_ADDR=0x%x, 1ST_ERR_TYPE=%d, SEC_CONT_CNT=%d, SEC_CNT=%d, DEC_CNT=%d\n", device, ch * 2 + 1, RREG32(base + ch * 0x1000 + 0x074), (val2 & 0x200) >> 9, (val2 & 0xFC00) >> 10, @@ -7079,7 +7110,8 @@ static void gaudi_handle_eqe(struct hl_device *hdev, case GAUDI_EVENT_HBM3_SPI_0: gaudi_print_irq_info(hdev, event_type, false); gaudi_hbm_read_interrupts(hdev, - gaudi_hbm_event_to_dev(event_type)); + gaudi_hbm_event_to_dev(event_type), + &eq_entry->hbm_ecc_data); if (hdev->hard_reset_on_fw_events) hl_device_reset(hdev, true, false); break; @@ -7090,7 +7122,8 @@ static void gaudi_handle_eqe(struct hl_device *hdev, case GAUDI_EVENT_HBM3_SPI_1: gaudi_print_irq_info(hdev, event_type, false); gaudi_hbm_read_interrupts(hdev, - gaudi_hbm_event_to_dev(event_type)); + gaudi_hbm_event_to_dev(event_type), + &eq_entry->hbm_ecc_data); break; case GAUDI_EVENT_TPC0_DEC: diff --git a/drivers/misc/habanalabs/include/common/cpucp_if.h b/drivers/misc/habanalabs/include/common/cpucp_if.h index 1c1e2b3..759c068 100644 --- a/drivers/misc/habanalabs/include/common/cpucp_if.h +++ b/drivers/misc/habanalabs/include/common/cpucp_if.h @@ -11,6 +11,37 @@ #include #include +#define NUM_HBM_PSEUDO_CH 2 +#define NUM_HBM_CH_PER_DEV 8 +#define CPUCP_PKT_HBM_ECC_INFO_WR_PAR_SHIFT 0 +#define CPUCP_PKT_HBM_ECC_INFO_WR_PAR_MASK 0x00000001 +#define CPUCP_PKT_HBM_ECC_INFO_RD_PAR_SHIFT 1 +#define CPUCP_PKT_HBM_ECC_INFO_RD_PAR_MASK 0x00000002 +#define CPUCP_PKT_HBM_ECC_INFO_CA_PAR_SHIFT 2 +#define CPUCP_PKT_HBM_ECC_INFO_CA_PAR_MASK 0x00000004 +#define CPUCP_PKT_HBM_ECC_INFO_DERR_SHIFT 3 +#define CPUCP_PKT_HBM_ECC_INFO_DERR_MASK 0x00000008 +#define CPUCP_PKT_HBM_ECC_INFO_SERR_SHIFT 4 +#define CPUCP_PKT_HBM_ECC_INFO_SERR_MASK 0x00000010 +#define CPUCP_PKT_HBM_ECC_INFO_TYPE_SHIFT 5 +#define CPUCP_PKT_HBM_ECC_INFO_TYPE_MASK 0x00000020 +#define CPUCP_PKT_HBM_ECC_INFO_HBM_CH_SHIFT 6 +#define CPUCP_PKT_HBM_ECC_INFO_HBM_CH_MASK 0x000007C0 + +struct hl_eq_hbm_ecc_data { + /* SERR counter */ + __le32 sec_cnt; + /* DERR counter */ + __le32 dec_cnt; + /* Supplemental Information according to the mask bits */ + __le32 hbm_ecc_info; + /* Address in hbm where the ecc happened */ + __le32 first_addr; + /* SERR continuous address counter */ + __le32 sec_cont_cnt; + __le32 pad; +}; + /* * EVENT QUEUE */ @@ -31,6 +62,7 @@ struct hl_eq_entry { struct hl_eq_header hdr; union { struct hl_eq_ecc_data ecc_data; + struct hl_eq_hbm_ecc_data hbm_ecc_data; __le64 data[7]; }; };