From bafed3f231f7037ce881de2278c14a679ee9c937 Mon Sep 17 00:00:00 2001
From: Kalesh AP
Date: Sat, 5 Mar 2022 03:54:41 -0500
Subject: [PATCH] bnxt_en: implement hw health reporter

This reporter will report NVM errors which are non-fatal.
When we receive these NVM error events, we'll report it
through this new hw health reporter.

Reviewed-by: Edwin Peer
Signed-off-by: Kalesh AP
Signed-off-by: Michael Chan
Signed-off-by: David S. Miller
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 19 ++++++
 drivers/net/ethernet/broadcom/bnxt/bnxt.h | 33 ++++++++++
 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 73 +++++++++++++++++++++++
 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h | 1 +
 4 files changed, 126 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 2de0295..63b8fc4f 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -2061,6 +2061,22 @@ static void bnxt_event_error_report(struct bnxt *bp, u32 data1, u32 data2)
 	case ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DOORBELL_DROP_THRESHOLD:
 		netdev_warn(bp->dev, "One or more MMIO doorbells dropped by the device!\n");
 		break;
+	case ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_NVM: {
+		struct bnxt_hw_health *hw_health = &bp->hw_health;
+
+		hw_health->nvm_err_address = EVENT_DATA2_NVM_ERR_ADDR(data2);
+		if (EVENT_DATA1_NVM_ERR_TYPE_WRITE(data1)) {
+			hw_health->synd = BNXT_HW_STATUS_NVM_WRITE_ERR;
+			hw_health->nvm_write_errors++;
+		} else if (EVENT_DATA1_NVM_ERR_TYPE_ERASE(data1)) {
+			hw_health->synd = BNXT_HW_STATUS_NVM_ERASE_ERR;
+			hw_health->nvm_erase_errors++;
+		} else {
+			hw_health->synd = BNXT_HW_STATUS_NVM_UNKNOWN_ERR;
+		}
+		set_bit(BNXT_FW_NVM_ERR_SP_EVENT, &bp->sp_event);
+		break;
+	}
 	default:
 		netdev_err(bp->dev, "FW reported unknown error type %u\n",
 			   err_type);
@@ -11887,6 +11903,9 @@ static void bnxt_sp_task(struct work_struct *work)
 	if (test_and_clear_bit(BNXT_FW_ECHO_REQUEST_SP_EVENT, &bp->sp_event))
 		bnxt_fw_echo_reply(bp);
 
+	if (test_and_clear_bit(BNXT_FW_NVM_ERR_SP_EVENT, &bp->sp_event))
+		bnxt_devlink_health_hw_report(bp);
+
 	/* These functions below will clear BNXT_STATE_IN_SP_TASK. They
 	 * must be the last functions to be called before exiting.
 	 */
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 447a940..fa0df43d 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -516,6 +516,21 @@ struct rx_tpa_end_cmp_ext {
 	  ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_DATA2_PIN_ID_MASK) >>\
 	 ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_DATA2_PIN_ID_SFT)
 
+#define EVENT_DATA2_NVM_ERR_ADDR(data2) \
+	(((data2) & \
+	  ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA2_ERR_ADDR_MASK) >>\
+	 ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA2_ERR_ADDR_SFT)
+
+#define EVENT_DATA1_NVM_ERR_TYPE_WRITE(data1) \
+	(((data1) & \
+	  ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_MASK) ==\
+	 ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_WRITE)
+
+#define EVENT_DATA1_NVM_ERR_TYPE_ERASE(data1) \
+	(((data1) & \
+	  ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_MASK) ==\
+	 ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_ERASE)
+
 struct nqe_cn {
 	__le16 type;
 	#define NQ_CN_TYPE_MASK 0x3fUL
@@ -1528,6 +1543,21 @@ struct bnxt_ctx_mem_info {
 	struct bnxt_mem_init mem_init[BNXT_CTX_MEM_INIT_MAX];
 };
 
+enum bnxt_hw_err {
+	BNXT_HW_STATUS_HEALTHY = 0x0,
+	BNXT_HW_STATUS_NVM_WRITE_ERR = 0x1,
+	BNXT_HW_STATUS_NVM_ERASE_ERR = 0x2,
+	BNXT_HW_STATUS_NVM_UNKNOWN_ERR = 0x3,
+};
+
+struct bnxt_hw_health {
+	u32 nvm_err_address;
+	u32 nvm_write_errors;
+	u32 nvm_erase_errors;
+	u8 synd;
+	struct devlink_health_reporter *hw_reporter;
+};
+
 enum bnxt_health_severity {
 	SEVERITY_NORMAL = 0,
 	SEVERITY_WARNING,
@@ -2045,6 +2075,7 @@ struct bnxt {
 #define BNXT_FW_EXCEPTION_SP_EVENT 19
 #define BNXT_LINK_CFG_CHANGE_SP_EVENT 21
 #define BNXT_FW_ECHO_REQUEST_SP_EVENT 23
+#define BNXT_FW_NVM_ERR_SP_EVENT 25
 
 	struct delayed_work fw_reset_task;
 	int fw_reset_state;
@@ -2145,6 +2176,8 @@ struct bnxt {
 	struct dentry *debugfs_pdev;
 	struct device *hwmon_dev;
 	enum board_idx board_idx;
+
+	struct bnxt_hw_health hw_health;
 };
 
 #define BNXT_NUM_RX_RING_STATS 8
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
index 0c17f90..a802bbd 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
@@ -241,6 +241,69 @@ static const struct devlink_health_reporter_ops bnxt_dl_fw_reporter_ops = {
 	.recover = bnxt_fw_recover,
 };
 
+static int bnxt_hw_recover(struct devlink_health_reporter *reporter,
+			   void *priv_ctx,
+			   struct netlink_ext_ack *extack)
+{
+	struct bnxt *bp = devlink_health_reporter_priv(reporter);
+	struct bnxt_hw_health *hw_health = &bp->hw_health;
+
+	hw_health->synd = BNXT_HW_STATUS_HEALTHY;
+	return 0;
+}
+
+static const char *hw_err_str(u8 synd)
+{
+	switch (synd) {
+	case BNXT_HW_STATUS_HEALTHY:
+		return "healthy";
+	case BNXT_HW_STATUS_NVM_WRITE_ERR:
+		return "nvm write error";
+	case BNXT_HW_STATUS_NVM_ERASE_ERR:
+		return "nvm erase error";
+	case BNXT_HW_STATUS_NVM_UNKNOWN_ERR:
+		return "unrecognized nvm error";
+	default:
+		return "unknown hw error";
+	}
+}
+
+static int bnxt_hw_diagnose(struct devlink_health_reporter *reporter,
+			    struct devlink_fmsg *fmsg,
+			    struct netlink_ext_ack *extack)
+{
+	struct bnxt *bp = devlink_health_reporter_priv(reporter);
+	struct bnxt_hw_health *h = &bp->hw_health;
+	int rc;
+
+	rc = devlink_fmsg_string_pair_put(fmsg, "Status", hw_err_str(h->synd));
+	if (rc)
+		return rc;
+	rc = devlink_fmsg_u32_pair_put(fmsg, "nvm_write_errors", h->nvm_write_errors);
+	if (rc)
+		return rc;
+	rc = devlink_fmsg_u32_pair_put(fmsg, "nvm_erase_errors", h->nvm_erase_errors);
+	if (rc)
+		return rc;
+	return 0;
+}
+
+void bnxt_devlink_health_hw_report(struct bnxt *bp)
+{
+	struct bnxt_hw_health *hw_health = &bp->hw_health;
+
+	netdev_warn(bp->dev, "%s reported at address 0x%x\n", hw_err_str(hw_health->synd),
+		    hw_health->nvm_err_address);
+
+	devlink_health_report(hw_health->hw_reporter, hw_err_str(hw_health->synd), NULL);
+}
+
+static const struct devlink_health_reporter_ops bnxt_dl_hw_reporter_ops = {
+	.name = "hw",
+	.diagnose = bnxt_hw_diagnose,
+	.recover = bnxt_hw_recover,
+};
+
 static struct devlink_health_reporter *
 __bnxt_dl_reporter_create(struct bnxt *bp,
 			  const struct devlink_health_reporter_ops *ops)
@@ -260,6 +323,10 @@ __bnxt_dl_reporter_create(struct bnxt *bp,
 void bnxt_dl_fw_reporters_create(struct bnxt *bp)
 {
 	struct bnxt_fw_health *fw_health = bp->fw_health;
+	struct bnxt_hw_health *hw_health = &bp->hw_health;
+
+	if (!hw_health->hw_reporter)
+		hw_health->hw_reporter = __bnxt_dl_reporter_create(bp, &bnxt_dl_hw_reporter_ops);
 
 	if (fw_health && !fw_health->fw_reporter)
 		fw_health->fw_reporter = __bnxt_dl_reporter_create(bp, &bnxt_dl_fw_reporter_ops);
@@ -268,6 +335,12 @@ void bnxt_dl_fw_reporters_create(struct bnxt *bp)
 void bnxt_dl_fw_reporters_destroy(struct bnxt *bp)
 {
 	struct bnxt_fw_health *fw_health = bp->fw_health;
+	struct bnxt_hw_health *hw_health = &bp->hw_health;
+
+	if (hw_health->hw_reporter) {
+		devlink_health_reporter_destroy(hw_health->hw_reporter);
+		hw_health->hw_reporter = NULL;
+	}
 
 	if (fw_health && fw_health->fw_reporter) {
 		devlink_health_reporter_destroy(fw_health->fw_reporter);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h
index b810506..056962e 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h
@@ -74,6 +74,7 @@ enum bnxt_dl_version_type {
 void bnxt_devlink_health_fw_report(struct bnxt *bp);
 void bnxt_dl_health_fw_status_update(struct bnxt *bp, bool healthy);
 void bnxt_dl_health_fw_recovery_done(struct bnxt *bp);
+void bnxt_devlink_health_hw_report(struct bnxt *bp);
 void bnxt_dl_fw_reporters_create(struct bnxt *bp);
 void bnxt_dl_fw_reporters_destroy(struct bnxt *bp);
 int bnxt_dl_register(struct bnxt *bp);
-- 
2.7.4