RDMA/hns: Add the detection for CMDQ status in the device initialization process
author Yangyang Li <liyangyang20@huawei.com>
Fri, 29 Apr 2022 09:31:04 +0000 (17:31 +0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 9 Jun 2022 08:23:10 +0000 (10:23 +0200)
[ Upstream commit e8ea058edc2b225a68b307057a65599625daaebf ]

CMDQ may fail during HNS ROCEE initialization. The following is the log
when the execution fails:

  hns3 0000:bd:00.2: In reset process RoCE client reinit.
  hns3 0000:bd:00.2: CMDQ move tail from 840 to 839
  hns3 0000:bd:00.2 hns_2: failed to set gid, ret = -11!
  hns3 0000:bd:00.2: CMDQ move tail from 840 to 839
  <...>
  hns3 0000:bd:00.2: CMDQ move tail from 840 to 839
  hns3 0000:bd:00.2: CMDQ move tail from 840 to 0
  hns3 0000:bd:00.2: [cmd]token 14e mailbox 20 timeout.
  hns3 0000:bd:00.2 hns_2: set HEM step 0 failed!
  hns3 0000:bd:00.2 hns_2: set HEM address to HW failed!
  hns3 0000:bd:00.2 hns_2: failed to alloc mtpt, ret = -16.
  infiniband hns_2: Couldn't create ib_mad PD
  infiniband hns_2: Couldn't open port 1
  hns3 0000:bd:00.2: Reset done, RoCE client reinit finished.

However, even if ib_mad client registration failed, ib_register_device()
still returns success to the driver.

In the device initialization process, CMDQ execution fails because HW/FW
is abnormal. Therefore, if CMDQ fails, the initialization function should
set CMDQ to a fatal error state and return a failure to the caller.

Fixes: 9a4435375cd1 ("IB/hns: Add driver files for hns RoCE driver")
Link: https://lore.kernel.org/r/20220429093104.26687-1-liangwenpeng@huawei.com
Signed-off-by: Yangyang Li <liyangyang20@huawei.com>
Signed-off-by: Wenpeng Liang <liangwenpeng@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
drivers/infiniband/hw/hns/hns_roce_device.h
drivers/infiniband/hw/hns/hns_roce_hw_v2.c

index 9467c39..c4cc51f 100644 (file)
@@ -559,6 +559,11 @@ struct hns_roce_cmd_context {
        u16                     busy;
 };
 
+enum hns_roce_cmdq_state {
+       HNS_ROCE_CMDQ_STATE_NORMAL,
+       HNS_ROCE_CMDQ_STATE_FATAL_ERR,
+};
+
 struct hns_roce_cmdq {
        struct dma_pool         *pool;
        struct semaphore        poll_sem;
@@ -578,6 +583,7 @@ struct hns_roce_cmdq {
         * close device, switch into poll mode(non event mode)
         */
        u8                      use_events;
+       enum hns_roce_cmdq_state state;
 };
 
 struct hns_roce_cmd_mailbox {
index 96fe73b..6ed040a 100644 (file)
@@ -1273,6 +1273,16 @@ static int hns_roce_cmq_csq_done(struct hns_roce_dev *hr_dev)
        return tail == priv->cmq.csq.head;
 }
 
+static void update_cmdq_status(struct hns_roce_dev *hr_dev)
+{
+       struct hns_roce_v2_priv *priv = hr_dev->priv;
+       struct hnae3_handle *handle = priv->handle;
+
+       if (handle->rinfo.reset_state == HNS_ROCE_STATE_RST_INIT ||
+           handle->rinfo.instance_state == HNS_ROCE_STATE_INIT)
+               hr_dev->cmd.state = HNS_ROCE_CMDQ_STATE_FATAL_ERR;
+}
+
 static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
                               struct hns_roce_cmq_desc *desc, int num)
 {
@@ -1326,6 +1336,8 @@ static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
                         csq->head, tail);
                csq->head = tail;
 
+               update_cmdq_status(hr_dev);
+
                ret = -EAGAIN;
        }
 
@@ -1340,6 +1352,9 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
        bool busy;
        int ret;
 
+       if (hr_dev->cmd.state == HNS_ROCE_CMDQ_STATE_FATAL_ERR)
+               return -EIO;
+
        if (!v2_chk_mbox_is_avail(hr_dev, &busy))
                return busy ? -EBUSY : 0;
 
@@ -1536,6 +1551,9 @@ static void hns_roce_function_clear(struct hns_roce_dev *hr_dev)
 {
        int i;
 
+       if (hr_dev->cmd.state == HNS_ROCE_CMDQ_STATE_FATAL_ERR)
+               return;
+
        for (i = hr_dev->func_num - 1; i >= 0; i--) {
                __hns_roce_function_clear(hr_dev, i);
                if (i != 0)
@@ -2818,6 +2836,9 @@ static int v2_wait_mbox_complete(struct hns_roce_dev *hr_dev, u32 timeout,
        mb_st = (struct hns_roce_mbox_status *)desc.data;
        end = msecs_to_jiffies(timeout) + jiffies;
        while (v2_chk_mbox_is_avail(hr_dev, &busy)) {
+               if (hr_dev->cmd.state == HNS_ROCE_CMDQ_STATE_FATAL_ERR)
+                       return -EIO;
+
                status = 0;
                hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_QUERY_MB_ST,
                                              true);