From 5d6a198f9dba12c78f82a0436d64bd683c031b13 Mon Sep 17 00:00:00 2001 From: Ohad Sharabi Date: Mon, 8 Feb 2021 14:53:56 +0200 Subject: [PATCH] habanalabs: reset device in case of sync error As the F/wW is the first to detect out of sync event, a new event is added to notify the driver on such event. In which case the driver performs hard reset. Signed-off-by: Ohad Sharabi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 18 +++++++++++++++++ drivers/misc/habanalabs/goya/goya.c | 23 ++++++++++++++++++++++ drivers/misc/habanalabs/include/common/cpucp_if.h | 9 +++++++++ .../habanalabs/include/gaudi/gaudi_async_events.h | 1 + .../habanalabs/include/goya/goya_async_events.h | 1 + 5 files changed, 52 insertions(+) diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 9152242..c1f0023 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -7097,6 +7097,15 @@ static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type, } } +static void gaudi_print_out_of_sync_info(struct hl_device *hdev, + struct cpucp_pkt_sync_err *sync_err) +{ + struct hl_hw_queue *q = &hdev->kernel_queues[GAUDI_QUEUE_ID_CPU_PQ]; + + dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n", + sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci)); +} + static int gaudi_soft_reset_late_init(struct hl_device *hdev) { struct gaudi_device *gaudi = hdev->asic_specific; @@ -7552,6 +7561,15 @@ static void gaudi_handle_eqe(struct hl_device *hdev, event_type, cause); break; + case GAUDI_EVENT_PKT_QUEUE_OUT_SYNC: + gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err); + if (hdev->hard_reset_on_fw_events) + hl_device_reset(hdev, true, false); + else + hl_fw_unmask_irq(hdev, event_type); + break; + default: dev_err(hdev->dev, "Received invalid H/W interrupt %d\n", event_type); diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index ed566c5..a40c428 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -4401,6 +4401,8 @@ static const char *_goya_get_event_desc(u16 event_type) return "THERMAL_ENV_S"; case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_E: return "THERMAL_ENV_E"; + case GOYA_ASYNC_EVENT_PKT_QUEUE_OUT_SYNC: + return "QUEUE_OUT_OF_SYNC"; default: return "N/A"; } @@ -4483,6 +4485,9 @@ static void goya_get_event_desc(u16 event_type, char *desc, size_t size) index = event_type - GOYA_ASYNC_EVENT_ID_DMA_BM_CH0; snprintf(desc, size, _goya_get_event_desc(event_type), index); break; + case GOYA_ASYNC_EVENT_PKT_QUEUE_OUT_SYNC: + snprintf(desc, size, _goya_get_event_desc(event_type)); + break; default: snprintf(desc, size, _goya_get_event_desc(event_type)); break; @@ -4534,6 +4539,15 @@ static void goya_print_mmu_error_info(struct hl_device *hdev) } } +static void goya_print_out_of_sync_info(struct hl_device *hdev, + struct cpucp_pkt_sync_err *sync_err) +{ + struct hl_hw_queue *q = &hdev->kernel_queues[GOYA_QUEUE_ID_CPU_PQ]; + + dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n", + sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci)); +} + static void goya_print_irq_info(struct hl_device *hdev, u16 event_type, bool razwi) { @@ -4754,6 +4768,15 @@ void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry) goya_unmask_irq(hdev, event_type); break; + case GOYA_ASYNC_EVENT_PKT_QUEUE_OUT_SYNC: + goya_print_irq_info(hdev, event_type, false); + goya_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err); + if (hdev->hard_reset_on_fw_events) + hl_device_reset(hdev, true, false); + else + hl_fw_unmask_irq(hdev, event_type); + break; + default: dev_err(hdev->dev, "Received invalid H/W interrupt %d\n", event_type); diff --git a/drivers/misc/habanalabs/include/common/cpucp_if.h b/drivers/misc/habanalabs/include/common/cpucp_if.h index b77c1c1..bf4e790 100644 --- a/drivers/misc/habanalabs/include/common/cpucp_if.h +++ b/drivers/misc/habanalabs/include/common/cpucp_if.h @@ -28,6 +28,14 @@ #define CPUCP_PKT_HBM_ECC_INFO_HBM_CH_SHIFT 6 #define CPUCP_PKT_HBM_ECC_INFO_HBM_CH_MASK 0x000007C0 +/* + * info of the pkt queue pointers in the first async occurrence + */ +struct cpucp_pkt_sync_err { + __le32 pi; + __le32 ci; +}; + struct hl_eq_hbm_ecc_data { /* SERR counter */ __le32 sec_cnt; @@ -77,6 +85,7 @@ struct hl_eq_entry { struct hl_eq_ecc_data ecc_data; struct hl_eq_hbm_ecc_data hbm_ecc_data; struct hl_eq_sm_sei_data sm_sei_data; + struct cpucp_pkt_sync_err pkt_sync_err; __le64 data[7]; }; }; diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_async_events.h b/drivers/misc/habanalabs/include/gaudi/gaudi_async_events.h index 49335e8..0a0fa57 100644 --- a/drivers/misc/habanalabs/include/gaudi/gaudi_async_events.h +++ b/drivers/misc/habanalabs/include/gaudi/gaudi_async_events.h @@ -303,6 +303,7 @@ enum gaudi_async_event_id { GAUDI_EVENT_NIC3_QP1 = 619, GAUDI_EVENT_NIC4_QP0 = 620, GAUDI_EVENT_NIC4_QP1 = 621, + GAUDI_EVENT_PKT_QUEUE_OUT_SYNC = 647, GAUDI_EVENT_FIX_POWER_ENV_S = 658, GAUDI_EVENT_FIX_POWER_ENV_E = 659, GAUDI_EVENT_FIX_THERMAL_ENV_S = 660, diff --git a/drivers/misc/habanalabs/include/goya/goya_async_events.h b/drivers/misc/habanalabs/include/goya/goya_async_events.h index 5fb9236..0908140 100644 --- a/drivers/misc/habanalabs/include/goya/goya_async_events.h +++ b/drivers/misc/habanalabs/include/goya/goya_async_events.h @@ -188,6 +188,7 @@ enum goya_async_event_id { GOYA_ASYNC_EVENT_ID_HALT_MACHINE = 485, GOYA_ASYNC_EVENT_ID_INTS_REGISTER = 486, GOYA_ASYNC_EVENT_ID_SOFT_RESET = 487, + GOYA_ASYNC_EVENT_PKT_QUEUE_OUT_SYNC = 506, GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_S = 507, GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_E = 508, GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_S = 509, -- 2.7.4