From 9ec58ec7d41a08d79d996407b03b23da69990814 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 4 Jan 2021 10:02:35 -0800 Subject: [PATCH] scsi: lpfc: Fix NVMe recovery after mailbox timeout If a mailbox command times out, the SLI port is deemed in error and the port is reset. The HBA cleanup is not returning I/Os to the NVMe layer before the port is unregistered. This is due to the HBA being marked offline (!SLI_ACTIVE) and cleanup being done by the mailbox timeout handler rather than an general adapter reset routine. The mailbox timeout handler mailbox handler only cleaned up SCSI I/Os. Fix by reworking the mailbox handler to: - After handling the mailbox error, detect the board is already in failure (may be due to another error), and leave cleanup to the other handler. - If the mailbox command timeout is initial detector of the port error, continue with the board cleanup and marking the adapter offline (!SLI_ACTIVE). Remove the SCSI-only I/O cleanup routine. The generic reset adapter routine that is subsequently invoked, will clean up the I/Os. - Have the reset adapter routine flush all NVMe and SCSI I/Os if the adapter has been marked failed (!SLI_ACTIVE). - Rework the NVMe I/O terminate routine to take a status code to fail the I/O with and update so that cleaned up I/O calls the wqe completion routine. Currently it is bypassing the wqe cleanup and calling the NVMe I/O completion directly. The wqe completion routine will take care of data structure and node cleanup then call the NVMe I/O completion handler. Link: https://lore.kernel.org/r/20210104180240.46824-11-jsmart2021@gmail.com Co-developed-by: Dick Kennedy Signed-off-by: Dick Kennedy Signed-off-by: James Smart Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_crtn.h | 4 ++-- drivers/scsi/lpfc/lpfc_init.c | 8 ++++++-- drivers/scsi/lpfc/lpfc_nvme.c | 33 +++++++++++++++++---------------- drivers/scsi/lpfc/lpfc_sli.c | 20 ++++++++++++-------- 4 files changed, 37 insertions(+), 28 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_crtn.h b/drivers/scsi/lpfc/lpfc_crtn.h index f78e52a..e70db9e 100644 --- a/drivers/scsi/lpfc/lpfc_crtn.h +++ b/drivers/scsi/lpfc/lpfc_crtn.h @@ -255,7 +255,6 @@ void lpfc_nvmet_ctxbuf_post(struct lpfc_hba *phba, int lpfc_nvmet_rcv_unsol_abort(struct lpfc_vport *vport, struct fc_frame_header *fc_hdr); void lpfc_nvmet_wqfull_process(struct lpfc_hba *phba, struct lpfc_queue *wq); -void lpfc_sli_flush_nvme_rings(struct lpfc_hba *phba); void lpfc_nvme_wait_for_io_drain(struct lpfc_hba *phba); void lpfc_sli4_build_dflt_fcf_record(struct lpfc_hba *, struct fcf_record *, uint16_t); @@ -598,7 +597,8 @@ void lpfc_release_io_buf(struct lpfc_hba *phba, struct lpfc_io_buf *ncmd, void lpfc_io_ktime(struct lpfc_hba *phba, struct lpfc_io_buf *ncmd); void lpfc_wqe_cmd_template(void); void lpfc_nvmet_cmd_template(void); -void lpfc_nvme_cancel_iocb(struct lpfc_hba *phba, struct lpfc_iocbq *pwqeIn); +void lpfc_nvme_cancel_iocb(struct lpfc_hba *phba, struct lpfc_iocbq *pwqeIn, + uint32_t stat, uint32_t param); extern int lpfc_enable_nvmet_cnt; extern unsigned long long lpfc_enable_nvmet[]; extern int lpfc_no_hba_reset_cnt; diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index af92676..c2619d5 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -6191,10 +6191,14 @@ lpfc_reset_hba(struct lpfc_hba *phba) phba->link_state = LPFC_HBA_ERROR; return; } - if (phba->sli.sli_flag & LPFC_SLI_ACTIVE) + + /* If not LPFC_SLI_ACTIVE, force all IO to be flushed */ + if (phba->sli.sli_flag & LPFC_SLI_ACTIVE) { lpfc_offline_prep(phba, LPFC_MBX_WAIT); - else + } else { lpfc_offline_prep(phba, LPFC_MBX_NO_WAIT); + lpfc_sli_flush_io_rings(phba); + } lpfc_offline(phba); lpfc_sli_brdrestart(phba); lpfc_online(phba); diff --git a/drivers/scsi/lpfc/lpfc_nvme.c b/drivers/scsi/lpfc/lpfc_nvme.c index fd4a1cf..e72c4cd 100644 --- a/drivers/scsi/lpfc/lpfc_nvme.c +++ b/drivers/scsi/lpfc/lpfc_nvme.c @@ -2596,14 +2596,17 @@ lpfc_nvme_wait_for_io_drain(struct lpfc_hba *phba) } void -lpfc_nvme_cancel_iocb(struct lpfc_hba *phba, struct lpfc_iocbq *pwqeIn) +lpfc_nvme_cancel_iocb(struct lpfc_hba *phba, struct lpfc_iocbq *pwqeIn, + uint32_t stat, uint32_t param) { #if (IS_ENABLED(CONFIG_NVME_FC)) struct lpfc_io_buf *lpfc_ncmd; struct nvmefc_fcp_req *nCmd; - struct lpfc_nvme_fcpreq_priv *freqpriv; + struct lpfc_wcqe_complete wcqe; + struct lpfc_wcqe_complete *wcqep = &wcqe; - if (!pwqeIn->context1) { + lpfc_ncmd = (struct lpfc_io_buf *)pwqeIn->context1; + if (!lpfc_ncmd) { lpfc_sli_release_iocbq(phba, pwqeIn); return; } @@ -2613,31 +2616,29 @@ lpfc_nvme_cancel_iocb(struct lpfc_hba *phba, struct lpfc_iocbq *pwqeIn) lpfc_sli_release_iocbq(phba, pwqeIn); return; } - lpfc_ncmd = (struct lpfc_io_buf *)pwqeIn->context1; spin_lock(&lpfc_ncmd->buf_lock); - if (!lpfc_ncmd->nvmeCmd) { + nCmd = lpfc_ncmd->nvmeCmd; + if (!nCmd) { spin_unlock(&lpfc_ncmd->buf_lock); lpfc_release_nvme_buf(phba, lpfc_ncmd); return; } + spin_unlock(&lpfc_ncmd->buf_lock); - nCmd = lpfc_ncmd->nvmeCmd; lpfc_printf_log(phba, KERN_INFO, LOG_NVME_IOERR, "6194 NVME Cancel xri %x\n", lpfc_ncmd->cur_iocbq.sli4_xritag); - nCmd->transferred_length = 0; - nCmd->rcv_rsplen = 0; - nCmd->status = NVME_SC_INTERNAL; - freqpriv = nCmd->private; - freqpriv->nvme_buf = NULL; - lpfc_ncmd->nvmeCmd = NULL; - - spin_unlock(&lpfc_ncmd->buf_lock); - nCmd->done(nCmd); + wcqep->word0 = 0; + bf_set(lpfc_wcqe_c_status, wcqep, stat); + wcqep->parameter = param; + wcqep->word3 = 0; /* xb is 0 */ /* Call release with XB=1 to queue the IO into the abort list. */ - lpfc_release_nvme_buf(phba, lpfc_ncmd); + if (phba->sli.sli_flag & LPFC_SLI_ACTIVE) + bf_set(lpfc_wcqe_c_xb, wcqep, 1); + + (pwqeIn->wqe_cmpl)(phba, pwqeIn, wcqep); #endif } diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 735fa1d..dedea5d 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -1532,15 +1532,19 @@ lpfc_sli_cancel_iocbs(struct lpfc_hba *phba, struct list_head *iocblist, while (!list_empty(iocblist)) { list_remove_head(iocblist, piocb, struct lpfc_iocbq, list); - if (!piocb->iocb_cmpl) { + if (piocb->wqe_cmpl) { if (piocb->iocb_flag & LPFC_IO_NVME) - lpfc_nvme_cancel_iocb(phba, piocb); + lpfc_nvme_cancel_iocb(phba, piocb, + ulpstatus, ulpWord4); else lpfc_sli_release_iocbq(phba, piocb); - } else { + + } else if (piocb->iocb_cmpl) { piocb->iocb.ulpStatus = ulpstatus; piocb->iocb.un.ulpWord[4] = ulpWord4; (piocb->iocb_cmpl) (phba, piocb, piocb); + } else { + lpfc_sli_release_iocbq(phba, piocb); } } return; @@ -8269,8 +8273,10 @@ lpfc_mbox_timeout_handler(struct lpfc_hba *phba) struct lpfc_sli *psli = &phba->sli; - /* If the mailbox completed, process the completion and return */ - if (lpfc_sli4_process_missed_mbox_completions(phba)) + /* If the mailbox completed, process the completion */ + lpfc_sli4_process_missed_mbox_completions(phba); + + if (!(psli->sli_flag & LPFC_SLI_ACTIVE)) return; if (pmbox != NULL) @@ -8311,8 +8317,6 @@ lpfc_mbox_timeout_handler(struct lpfc_hba *phba) psli->sli_flag &= ~LPFC_SLI_ACTIVE; spin_unlock_irq(&phba->hbalock); - lpfc_sli_abort_fcp_rings(phba); - lpfc_printf_log(phba, KERN_ERR, LOG_TRACE_EVENT, "0345 Resetting board due to mailbox timeout\n"); @@ -11783,7 +11787,7 @@ lpfc_sli_validate_fcp_iocb(struct lpfc_iocbq *iocbq, struct lpfc_vport *vport, struct lpfc_io_buf *lpfc_cmd; int rc = 1; - if (iocbq->vport != vport) + if (!iocbq || iocbq->vport != vport) return rc; if (!(iocbq->iocb_flag & LPFC_IO_FCP) || -- 2.7.4