scsi: ufs: Fix a race condition between error handler and runtime PM ops

author Can Guo <cang@codeaurora.org>
Sun, 9 Aug 2020 12:15:54 +0000 (05:15 -0700)
committer Martin K. Petersen <martin.petersen@oracle.com>
Tue, 18 Aug 2020 00:54:56 +0000 (20:54 -0400)
diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 602c746..9ebb5cd 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -126,7 +126,8 @@ enum {
         UFSHCD_STATE_RESET,
         UFSHCD_STATE_ERROR,
         UFSHCD_STATE_OPERATIONAL,
-       UFSHCD_STATE_EH_SCHEDULED,
+       UFSHCD_STATE_EH_SCHEDULED_FATAL,
+       UFSHCD_STATE_EH_SCHEDULED_NON_FATAL,
  };
  
  /* UFSHCD error handling flags */
@@ -2515,34 +2516,6 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
         if (!down_read_trylock(&hba->clk_scaling_lock))
                 return SCSI_MLQUEUE_HOST_BUSY;
  
-       spin_lock_irqsave(hba->host->host_lock, flags);
-       switch (hba->ufshcd_state) {
-       case UFSHCD_STATE_OPERATIONAL:
-               break;
-       case UFSHCD_STATE_EH_SCHEDULED:
-       case UFSHCD_STATE_RESET:
-               err = SCSI_MLQUEUE_HOST_BUSY;
-               goto out_unlock;
-       case UFSHCD_STATE_ERROR:
-               set_host_byte(cmd, DID_ERROR);
-               cmd->scsi_done(cmd);
-               goto out_unlock;
-       default:
-               dev_WARN_ONCE(hba->dev, 1, "%s: invalid state %d\n",
-                               __func__, hba->ufshcd_state);
-               set_host_byte(cmd, DID_BAD_TARGET);
-               cmd->scsi_done(cmd);
-               goto out_unlock;
-       }
-
-       /* if error handling is in progress, don't issue commands */
-       if (ufshcd_eh_in_progress(hba)) {
-               set_host_byte(cmd, DID_ERROR);
-               cmd->scsi_done(cmd);
-               goto out_unlock;
-       }
-       spin_unlock_irqrestore(hba->host->host_lock, flags);
-
         hba->req_abort_count = 0;
  
         err = ufshcd_hold(hba, true);
@@ -2578,11 +2551,51 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
         /* Make sure descriptors are ready before ringing the doorbell */
         wmb();
  
-       /* issue command to the controller */
         spin_lock_irqsave(hba->host->host_lock, flags);
+       switch (hba->ufshcd_state) {
+       case UFSHCD_STATE_OPERATIONAL:
+       case UFSHCD_STATE_EH_SCHEDULED_NON_FATAL:
+               break;
+       case UFSHCD_STATE_EH_SCHEDULED_FATAL:
+               /*
+                * pm_runtime_get_sync() is used at error handling preparation
+                * stage. If a scsi cmd, e.g. the SSU cmd, is sent from hba's
+                * PM ops, it can never be finished if we let SCSI layer keep
+                * retrying it, which gets err handler stuck forever. Neither
+                * can we let the scsi cmd pass through, because UFS is in bad
+                * state, the scsi cmd may eventually time out, which will get
+                * err handler blocked for too long. So, just fail the scsi cmd
+                * sent from PM ops, err handler can recover PM error anyways.
+                */
+               if (hba->pm_op_in_progress) {
+                       hba->force_reset = true;
+                       set_host_byte(cmd, DID_BAD_TARGET);
+                       goto out_compl_cmd;
+               }
+               fallthrough;
+       case UFSHCD_STATE_RESET:
+               err = SCSI_MLQUEUE_HOST_BUSY;
+               goto out_compl_cmd;
+       case UFSHCD_STATE_ERROR:
+               set_host_byte(cmd, DID_ERROR);
+               goto out_compl_cmd;
+       default:
+               dev_WARN_ONCE(hba->dev, 1, "%s: invalid state %d\n",
+                               __func__, hba->ufshcd_state);
+               set_host_byte(cmd, DID_BAD_TARGET);
+               goto out_compl_cmd;
+       }
         ufshcd_send_command(hba, tag);
-out_unlock:
         spin_unlock_irqrestore(hba->host->host_lock, flags);
+       goto out;
+
+out_compl_cmd:
+       scsi_dma_unmap(lrbp->cmd);
+       lrbp->cmd = NULL;
+       spin_unlock_irqrestore(hba->host->host_lock, flags);
+       ufshcd_release(hba);
+       if (!err)
+               cmd->scsi_done(cmd);
  out:
         up_read(&hba->clk_scaling_lock);
         return err;
@@ -5552,9 +5565,12 @@ static inline void ufshcd_schedule_eh_work(struct ufs_hba *hba)
  {
         /* handle fatal errors only when link is not in error state */
         if (hba->ufshcd_state != UFSHCD_STATE_ERROR) {
-               hba->ufshcd_state = UFSHCD_STATE_EH_SCHEDULED;
-               if (queue_work(hba->eh_wq, &hba->eh_work))
-                       ufshcd_scsi_block_requests(hba);
+               if (hba->force_reset || ufshcd_is_link_broken(hba) ||
+                   ufshcd_is_saved_err_fatal(hba))
+                       hba->ufshcd_state = UFSHCD_STATE_EH_SCHEDULED_FATAL;
+               else
+                       hba->ufshcd_state = UFSHCD_STATE_EH_SCHEDULED_NON_FATAL;
+               queue_work(hba->eh_wq, &hba->eh_work);
         }
  }
  
@@ -5664,6 +5680,7 @@ static void ufshcd_err_handler(struct work_struct *work)
         spin_unlock_irqrestore(hba->host->host_lock, flags);
         ufshcd_err_handling_prepare(hba);
         spin_lock_irqsave(hba->host->host_lock, flags);
+       ufshcd_scsi_block_requests(hba);
         /*
          * A full reset and restore might have happened after preparation
          * is finished, double check whether we should stop.
author	Can Guo <cang@codeaurora.org>
	Sun, 9 Aug 2020 12:15:54 +0000 (05:15 -0700)
committer	Martin K. Petersen <martin.petersen@oracle.com>
	Tue, 18 Aug 2020 00:54:56 +0000 (20:54 -0400)