RDMA/bnxt_re: use firmware provided max request timeout
authorKashyap Desai <kashyap.desai@broadcom.com>
Fri, 9 Jun 2023 11:01:52 +0000 (04:01 -0700)
committerLeon Romanovsky <leon@kernel.org>
Mon, 12 Jun 2023 07:10:54 +0000 (10:10 +0300)
Firmware provides the max request timeout value as part of the hwrm_ver_get
API. The driver gets the timeout from firmware and, if that interface is
not available, falls back to a hardcoded timeout value.
Also, add a helper function to check the FW status.

Signed-off-by: Kashyap Desai <kashyap.desai@broadcom.com>
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
Link: https://lore.kernel.org/r/1686308514-11996-16-git-send-email-selvin.xavier@broadcom.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
drivers/infiniband/hw/bnxt_re/main.c
drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
drivers/infiniband/hw/bnxt_re/qplib_res.h

index 8241154..a2c7d3f 100644 (file)
@@ -1041,6 +1041,7 @@ static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev)
        struct bnxt_en_dev *en_dev = rdev->en_dev;
        struct hwrm_ver_get_output resp = {0};
        struct hwrm_ver_get_input req = {0};
+       struct bnxt_qplib_chip_ctx *cctx;
        struct bnxt_fw_msg fw_msg;
        int rc = 0;
 
@@ -1058,11 +1059,18 @@ static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev)
                          rc);
                return;
        }
+
+       cctx = rdev->chip_ctx;
        rdev->qplib_ctx.hwrm_intf_ver =
                (u64)le16_to_cpu(resp.hwrm_intf_major) << 48 |
                (u64)le16_to_cpu(resp.hwrm_intf_minor) << 32 |
                (u64)le16_to_cpu(resp.hwrm_intf_build) << 16 |
                le16_to_cpu(resp.hwrm_intf_patch);
+
+       cctx->hwrm_cmd_max_timeout = le16_to_cpu(resp.max_req_timeout);
+
+       if (!cctx->hwrm_cmd_max_timeout)
+               cctx->hwrm_cmd_max_timeout = RCFW_FW_STALL_MAX_TIMEOUT;
 }
 
 static int bnxt_re_ib_init(struct bnxt_re_dev *rdev)
index 8b1b413..99aa1ae 100644 (file)
@@ -90,6 +90,41 @@ static int bnxt_qplib_map_rc(u8 opcode)
 }
 
 /**
+ * bnxt_re_is_fw_stalled   -   Check firmware health
+ * @rcfw:      rcfw channel instance of rdev
+ * @cookie:    cookie to track the command (only logged here)
+ * @opcode:    opcode of the submitted command (only logged here)
+ * @cbit:      bitmap entry of cookie (its cmdq_bitmap bit is logged)
+ *
+ * If more than rcfw->max_timeout seconds have elapsed since
+ * cmdq->last_seen, warn (rate-limited) and treat firmware as stalled.
+ *
+ * Returns:
+ * 0 if firmware is responding
+ * -ENODEV if firmware is not responding
+ */
+static int bnxt_re_is_fw_stalled(struct bnxt_qplib_rcfw *rcfw,
+                                u16 cookie, u8 opcode, u16 cbit)
+{
+       struct bnxt_qplib_cmdq_ctx *cmdq;
+
+       cmdq = &rcfw->cmdq;
+
+       if (time_after(jiffies, cmdq->last_seen +
+                     (rcfw->max_timeout * HZ))) {
+               dev_warn_ratelimited(&rcfw->pdev->dev,
+                                    "%s: FW STALL Detected. cmdq[%#x]=%#x waited (%d > %d) msec active %d ",
+                                    __func__, cookie, opcode,
+                                    jiffies_to_msecs(jiffies - cmdq->last_seen),
+                                    rcfw->max_timeout * 1000,
+                                    test_bit(cbit, cmdq->cmdq_bitmap));
+               return -ENODEV;
+       }
+
+       return 0;
+}
+
+/**
  * __wait_for_resp   - Don't hold the cpu context and wait for response
  * @rcfw      -   rcfw channel instance of rdev
  * @cookie    -   cookie to track the command
@@ -105,6 +140,7 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode)
 {
        struct bnxt_qplib_cmdq_ctx *cmdq;
        u16 cbit;
+       int ret;
 
        cmdq = &rcfw->cmdq;
        cbit = cookie % rcfw->cmdq_depth;
@@ -118,8 +154,8 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode)
                wait_event_timeout(cmdq->waitq,
                                   !test_bit(cbit, cmdq->cmdq_bitmap) ||
                                   test_bit(ERR_DEVICE_DETACHED, &cmdq->flags),
-                                  msecs_to_jiffies(RCFW_FW_STALL_TIMEOUT_SEC
-                                                   * 1000));
+                                  msecs_to_jiffies(rcfw->max_timeout * 1000));
+
                if (!test_bit(cbit, cmdq->cmdq_bitmap))
                        return 0;
 
@@ -128,10 +164,9 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode)
                if (!test_bit(cbit, cmdq->cmdq_bitmap))
                        return 0;
 
-               /* Firmware stall is detected */
-               if (time_after(jiffies, cmdq->last_seen +
-                             (RCFW_FW_STALL_TIMEOUT_SEC * HZ)))
-                       return -ENODEV;
+               ret = bnxt_re_is_fw_stalled(rcfw, cookie, opcode, cbit);
+               if (ret)
+                       return ret;
 
        } while (true);
 };
@@ -352,6 +387,7 @@ static int __poll_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie,
        struct bnxt_qplib_cmdq_ctx *cmdq = &rcfw->cmdq;
        unsigned long issue_time;
        u16 cbit;
+       int ret;
 
        cbit = cookie % rcfw->cmdq_depth;
        issue_time = jiffies;
@@ -368,11 +404,10 @@ static int __poll_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie,
                if (!test_bit(cbit, cmdq->cmdq_bitmap))
                        return 0;
                if (jiffies_to_msecs(jiffies - issue_time) >
-                   (RCFW_FW_STALL_TIMEOUT_SEC * 1000)) {
-                       /* Firmware stall is detected */
-                       if (time_after(jiffies, cmdq->last_seen +
-                                     (RCFW_FW_STALL_TIMEOUT_SEC * HZ)))
-                               return -ENODEV;
+                   (rcfw->max_timeout * 1000)) {
+                       ret = bnxt_re_is_fw_stalled(rcfw, cookie, opcode, cbit);
+                       if (ret)
+                               return ret;
                }
        } while (true);
 };
@@ -951,6 +986,8 @@ int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res,
        if (!rcfw->qp_tbl)
                goto fail;
 
+       rcfw->max_timeout = res->cctx->hwrm_cmd_max_timeout;
+
        return 0;
 
 fail:
index 338bf6a..b644dcc 100644 (file)
@@ -51,7 +51,7 @@
 
 #define RCFW_DBR_PCI_BAR_REGION                2
 #define RCFW_DBR_BASE_PAGE_SHIFT       12
-#define RCFW_FW_STALL_TIMEOUT_SEC      40
+#define RCFW_FW_STALL_MAX_TIMEOUT      40
 
 /* Cmdq contains a fix number of a 16-Byte slots */
 struct bnxt_qplib_cmdqe {
@@ -227,6 +227,8 @@ struct bnxt_qplib_rcfw {
        atomic_t rcfw_intr_enabled;
        struct semaphore rcfw_inflight;
        atomic_t timeout_send;
+       /* cached from chip cctx for quick reference in slow path */
+       u16 max_timeout;
 };
 
 struct bnxt_qplib_cmdqmsg {
index 982e2c9..77f0b84 100644 (file)
@@ -55,6 +55,7 @@ struct bnxt_qplib_chip_ctx {
        u8      chip_rev;
        u8      chip_metal;
        u16     hw_stats_size;
+       u16     hwrm_cmd_max_timeout;
        struct bnxt_qplib_drv_modes modes;
 };