From cb5fb665f30388cf8cb9becae86dcb84ace0ca88 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Sun, 30 Oct 2022 13:08:37 +0200 Subject: [PATCH] habanalabs/gaudi: add razwi notify event Each time razwi (read-only zero, write ignore) happens, besides capturing its data, also notify the user about it. Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 8 +++++++ drivers/misc/habanalabs/common/habanalabs.h | 2 ++ drivers/misc/habanalabs/gaudi/gaudi.c | 37 +++++++++++++++-------------- include/uapi/misc/habanalabs.h | 2 ++ 4 files changed, 31 insertions(+), 18 deletions(-) diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 35ed494..d1a6095 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -2409,6 +2409,14 @@ void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_ num_of_engines * sizeof(u16)); hdev->captured_err_info.razwi.flags = flags; } + +void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines, + u8 flags, u64 *event_mask) +{ + hl_capture_razwi(hdev, addr, engine_id, num_of_engines, flags); + *event_mask |= HL_NOTIFIER_EVENT_RAZWI; +} + static void hl_capture_user_mappings(struct hl_device *hdev, bool is_pmmu) { struct page_fault_info *pgf_info = &hdev->captured_err_info.pgf_info; diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index e391e79..d9335f3 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -3812,6 +3812,8 @@ hl_mmap_mem_buf_alloc(struct hl_mem_mgr *mmg, __printf(2, 3) void hl_engine_data_sprintf(struct engines_data *e, const char *fmt, ...); void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines, u8 flags); +void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines, + u8 flags, u64 *event_mask); void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu); #ifdef CONFIG_DEBUG_FS diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 3dfb9ec..035865c 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -7301,7 +7301,7 @@ static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type, u64 *e } static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type, - bool razwi) + bool razwi, u64 *event_mask) { bool is_read = false, is_write = false; u16 engine_id[2], num_of_razwi_eng = 0; @@ -7337,7 +7337,8 @@ static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type, num_of_razwi_eng = 1; } - hl_capture_razwi(hdev, razwi_addr, engine_id, num_of_razwi_eng, razwi_flags); + hl_handle_razwi(hdev, razwi_addr, engine_id, num_of_razwi_eng, razwi_flags, + event_mask); } } @@ -7675,7 +7676,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_HBM_0_DERR ... GAUDI_EVENT_HBM_3_DERR: case GAUDI_EVENT_MMU_DERR: case GAUDI_EVENT_NIC0_CS_DBG_DERR ... GAUDI_EVENT_NIC4_CS_DBG_DERR: - gaudi_print_irq_info(hdev, event_type, true); + gaudi_print_irq_info(hdev, event_type, true, &event_mask); gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data); event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR; @@ -7685,7 +7686,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_AXI_ECC: case GAUDI_EVENT_L2_RAM_ECC: case GAUDI_EVENT_PLL0 ... GAUDI_EVENT_PLL17: - gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_irq_info(hdev, event_type, false, &event_mask); fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; goto reset_device; @@ -7694,7 +7695,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_HBM1_SPI_0: case GAUDI_EVENT_HBM2_SPI_0: case GAUDI_EVENT_HBM3_SPI_0: - gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_irq_info(hdev, event_type, false, &event_mask); gaudi_hbm_read_interrupts(hdev, gaudi_hbm_event_to_dev(event_type), &eq_entry->hbm_ecc_data); @@ -7706,7 +7707,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_HBM1_SPI_1: case GAUDI_EVENT_HBM2_SPI_1: case GAUDI_EVENT_HBM3_SPI_1: - gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_irq_info(hdev, event_type, false, &event_mask); gaudi_hbm_read_interrupts(hdev, gaudi_hbm_event_to_dev(event_type), &eq_entry->hbm_ecc_data); @@ -7728,7 +7729,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr * if the event is a TPC Assertion or a "real" TPC DEC. */ event_mask |= HL_NOTIFIER_EVENT_TPC_ASSERT; - gaudi_print_irq_info(hdev, event_type, true); + gaudi_print_irq_info(hdev, event_type, true, &event_mask); reset_required = gaudi_tpc_read_interrupts(hdev, tpc_dec_event_to_tpc_id(event_type), "AXI_SLV_DEC_Error"); @@ -7753,7 +7754,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_TPC5_KRN_ERR: case GAUDI_EVENT_TPC6_KRN_ERR: case GAUDI_EVENT_TPC7_KRN_ERR: - gaudi_print_irq_info(hdev, event_type, true); + gaudi_print_irq_info(hdev, event_type, true, &event_mask); reset_required = gaudi_tpc_read_interrupts(hdev, tpc_krn_event_to_tpc_id(event_type), "KRN_ERR"); @@ -7792,7 +7793,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_HBM_0_SERR ... GAUDI_EVENT_HBM_3_SERR: fallthrough; case GAUDI_EVENT_MMU_SERR: - gaudi_print_irq_info(hdev, event_type, true); + gaudi_print_irq_info(hdev, event_type, true, &event_mask); gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data); hl_fw_unmask_irq(hdev, event_type); event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; @@ -7802,14 +7803,14 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_CPU_AXI_SPLITTER: case GAUDI_EVENT_PSOC_AXI_DEC: case GAUDI_EVENT_PSOC_PRSTN_FALL: - gaudi_print_irq_info(hdev, event_type, true); + gaudi_print_irq_info(hdev, event_type, true, &event_mask); hl_fw_unmask_irq(hdev, event_type); event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; break; case GAUDI_EVENT_MMU_PAGE_FAULT: case GAUDI_EVENT_MMU_WR_PERM: - gaudi_print_irq_info(hdev, event_type, true); + gaudi_print_irq_info(hdev, event_type, true, &event_mask); hl_fw_unmask_irq(hdev, event_type); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -7838,14 +7839,14 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_NIC4_QM1: case GAUDI_EVENT_DMA0_CORE ... GAUDI_EVENT_DMA7_CORE: case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM: - gaudi_print_irq_info(hdev, event_type, true); + gaudi_print_irq_info(hdev, event_type, true, &event_mask); gaudi_handle_qman_err(hdev, event_type, &event_mask); hl_fw_unmask_irq(hdev, event_type); event_mask |= (HL_NOTIFIER_EVENT_USER_ENGINE_ERR | HL_NOTIFIER_EVENT_DEVICE_RESET); break; case GAUDI_EVENT_RAZWI_OR_ADC_SW: - gaudi_print_irq_info(hdev, event_type, true); + gaudi_print_irq_info(hdev, event_type, true, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; goto reset_device; @@ -7858,7 +7859,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_TPC6_BMON_SPMU: case GAUDI_EVENT_TPC7_BMON_SPMU: case GAUDI_EVENT_DMA_BM_CH0 ... GAUDI_EVENT_DMA_BM_CH7: - gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_irq_info(hdev, event_type, false, &event_mask); hl_fw_unmask_irq(hdev, event_type); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -7870,7 +7871,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr break; case GAUDI_EVENT_DMA_IF_SEI_0 ... GAUDI_EVENT_DMA_IF_SEI_3: - gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_irq_info(hdev, event_type, false, &event_mask); gaudi_print_sm_sei_info(hdev, event_type, &eq_entry->sm_sei_data); rc = hl_state_dump(hdev); @@ -7899,18 +7900,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr break; case GAUDI_EVENT_DEV_RESET_REQ: - gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_irq_info(hdev, event_type, false, &event_mask); event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; goto reset_device; case GAUDI_EVENT_PKT_QUEUE_OUT_SYNC: - gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_irq_info(hdev, event_type, false, &event_mask); gaudi_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err); event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; goto reset_device; case GAUDI_EVENT_FW_ALIVE_S: - gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_irq_info(hdev, event_type, false, &event_mask); gaudi_print_fw_alive_info(hdev, &eq_entry->fw_alive); event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; goto reset_device; diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 5834399..7747e19 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -721,6 +721,7 @@ enum hl_server_type { * HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE - Indicates device is unavailable * HL_NOTIFIER_EVENT_USER_ENGINE_ERR - Indicates device engine in error state * HL_NOTIFIER_EVENT_GENERAL_HW_ERR - Indicates device HW error + * HL_NOTIFIER_EVENT_RAZWI - Indicates razwi happened */ #define HL_NOTIFIER_EVENT_TPC_ASSERT (1ULL << 0) #define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE (1ULL << 1) @@ -729,6 +730,7 @@ enum hl_server_type { #define HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE (1ULL << 4) #define HL_NOTIFIER_EVENT_USER_ENGINE_ERR (1ULL << 5) #define HL_NOTIFIER_EVENT_GENERAL_HW_ERR (1ULL << 6) +#define HL_NOTIFIER_EVENT_RAZWI (1ULL << 7) /* Opcode for management ioctl * -- 2.7.4