IB/hfi1: Ignore LNI errors before DC8051 transitions to Polling state
authorKaike Wan <kaike.wan@intel.com>
Wed, 28 Nov 2018 18:19:04 +0000 (10:19 -0800)
committerJason Gunthorpe <jgg@mellanox.com>
Fri, 7 Dec 2018 02:50:08 +0000 (19:50 -0700)
When it is requested to change its physical state back to Offline while in
the process to go up, DC8051 will set the ERROR field in the
DC8051_DBG_ERR_INFO_SET_BY_8051 register. This ERROR field will remain
until the next time when DC8051 transitions from Offline to Polling.
Subsequently, when the host requests DC8051 to change its physical state
to Polling again, it may receive a DC8051 interrupt with the stale ERROR
field still in DC8051_DBG_ERR_INFO_SET_BY_8051. If the host link state has
been changed to Polling, this stale ERROR will force the host to
transition to Offline state, resulting in a vicious cycle of Polling
->Offline->Polling->Offline. On the other hand, if the host link state is
still Offline when the stale ERROR is received, the stale ERROR will be
ignored, and the link will come up correctly.  This patch implements the
correct behavior by changing host link state to Polling only after DC8051
changes its physical state to Polling.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Krzysztof Goreczny <krzysztof.goreczny@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
drivers/infiniband/hw/hfi1/chip.c

index 9b20479..385c337 100644 (file)
@@ -1072,6 +1072,8 @@ static void log_state_transition(struct hfi1_pportdata *ppd, u32 state);
 static void log_physical_state(struct hfi1_pportdata *ppd, u32 state);
 static int wait_physical_linkstate(struct hfi1_pportdata *ppd, u32 state,
                                   int msecs);
+static int wait_phys_link_out_of_offline(struct hfi1_pportdata *ppd,
+                                        int msecs);
 static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc);
 static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr);
 static void handle_temp_err(struct hfi1_devdata *dd);
@@ -10770,13 +10772,15 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
                        break;
 
                ppd->port_error_action = 0;
-               ppd->host_link_state = HLS_DN_POLL;
 
                if (quick_linkup) {
                        /* quick linkup does not go into polling */
                        ret = do_quick_linkup(dd);
                } else {
                        ret1 = set_physical_link_state(dd, PLS_POLLING);
+                       if (!ret1)
+                               ret1 = wait_phys_link_out_of_offline(ppd,
+                                                                    3000);
                        if (ret1 != HCMD_SUCCESS) {
                                dd_dev_err(dd,
                                           "Failed to transition to Polling link state, return 0x%x\n",
@@ -10784,6 +10788,14 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
                                ret = -EINVAL;
                        }
                }
+
+               /*
+                * Change the host link state after requesting DC8051 to
+                * change its physical state so that we can ignore any
+                * interrupt with stale LNI(XX) error, which will not be
+                * cleared until DC8051 transitions to Polling state.
+                */
+               ppd->host_link_state = HLS_DN_POLL;
                ppd->offline_disabled_reason =
                        HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE);
                /*
@@ -12927,6 +12939,39 @@ static int wait_phys_link_offline_substates(struct hfi1_pportdata *ppd,
        return read_state;
 }
 
+/*
+ * wait_phys_link_out_of_offline - wait for any out of offline state
+ * @ppd: port device
+ * @msecs: the number of milliseconds to wait
+ *
+ * Wait up to msecs milliseconds for any out of offline physical link
+ * state change to occur.
+ * Returns 0 if at least one state is reached, otherwise -ETIMEDOUT.
+ */
+static int wait_phys_link_out_of_offline(struct hfi1_pportdata *ppd,
+                                        int msecs)
+{
+       u32 read_state;
+       unsigned long timeout;
+
+       timeout = jiffies + msecs_to_jiffies(msecs);
+       while (1) {
+               read_state = read_physical_state(ppd->dd);
+               if ((read_state & 0xF0) != PLS_OFFLINE)
+                       break;
+               if (time_after(jiffies, timeout)) {
+                       dd_dev_err(ppd->dd,
+                                  "timeout waiting for phy link out of offline. Read state 0x%x, %dms\n",
+                                  read_state, msecs);
+                       return -ETIMEDOUT;
+               }
+               usleep_range(1950, 2050); /* sleep 2ms-ish */
+       }
+
+       log_state_transition(ppd, read_state);
+       return read_state;
+}
+
 #define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
 (r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)